In [ ]:
# Get the Natural Language Toolkit (NLTK)
import nltk

# Get the data science package Pandas
import pandas as pd

# Get the library matplotlib for making pretty charts
import matplotlib.pyplot as plt

# Make plots appear here in this notebook
%matplotlib inline

# This just makes the plots bigger, so that we can see them more easily. 
plt.rcParams['figure.figsize'] = (12,4)

# Get all the example books from the NLTK textbook
from nltk.book import *

In [ ]:
# Let's explore these texts a little. 
# There are lots of things we can do with these texts. 
# To see a list, type text1. and press <Tab>
text1.collocations()
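
In [ ]:
# As a quick aside (just a sketch), here are a couple of the other methods that
# <Tab>-completion reveals -- 'monstrous' and 'whale' are only example words:
text1.similar('monstrous')
text1.count('whale')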

In [ ]:
# But what if we get tired of doing that for each text, and want to do it with all of them? 
# Put the texts into a list.
alltexts = [text1, text2, text3, text4, text5, text6, text7, text8, text9]

In [ ]:
# Let's look at it to make sure it's all there. 
alltexts

In [ ]:
for text in alltexts: 
    text.collocations()
    print('---')

In [ ]:
text6.concordance('shrubbery')

In [ ]:
text1.dispersion_plot(['Ahab', 'Ishmael', 'whale'])

In [ ]:
text2.dispersion_plot(['Elinor', 'Marianne', 'Edward', 'Willoughby'])

In [ ]:
text6.dispersion_plot(['Ni', 'shrubbery'])

In [ ]:
# Let's count the words in a text
len(text1)

In [ ]:
# Put the texts and their wordcounts into a lookup table
lengths = {text.name: len(text) for text in alltexts}

In [ ]:
lengths

In [ ]:
pd.Series(lengths)

In [ ]:
pd.Series(lengths).plot(kind='bar')
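
In [ ]:
# As an optional extra: sorting the Series first can make the bar chart easier
# to read. sort_values() is a standard pandas Series method.
pd.Series(lengths).sort_values().plot(kind='bar')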

In [ ]:
# That in itself is not very interesting. 
# So let's see if we can not only count the words, but count the vocabulary
# of a text.
# To do that, we can use `set()`, which keeps only one copy of each distinct word. 
porky_sentence = "the the the the the that's all folks"
porky_words = porky_sentence.split()
porky_words

In [ ]:
# We can count the words in the sentence easily: 
len(porky_words)

In [ ]:
# To list each word only once, ignoring repeats, we can use the function set(). 
set(porky_words)

In [ ]:
# So if we count this set, we can determine the vocabulary of a text. 
len(set(porky_words))

In [ ]:
# Let's see if we can find the vocabulary of Moby Dick.
len(set(text1))

In [ ]:
# Pretty big, but then again, Moby Dick is kind of a long novel. 
# We can adjust for length by dividing the total word count by the vocabulary size, 
# which gives the average number of times each word is used: 
len(text1) / len(set(text1))

In [ ]:
# This would get tedious if we did this for every text, 
# so let's write a function!
def vocab(text): 
    # Total words divided by distinct words: the average number of times 
    # each word gets used. 
    return len(text) / len(set(text))

In [ ]:
vocab(porky_words)

In [ ]:
# Let's go through each text, compute this measure for it, and put the results in a table. 
vocabularies = {text.name: vocab(text) for text in alltexts}

In [ ]:
# Let's put that table into Pandas so we can see it better: 
pd.Series(vocabularies)

In [ ]:
# Now let's plot that. 
pd.Series(vocabularies).plot(kind='bar')

In [ ]:
# OK, now let's make a word cloud
from wordcloud import WordCloud

In [ ]:
rawtext = ' '.join(text1.tokens) # Stitch it back together. 
wc = WordCloud(width=800, height=600, background_color='white')
im = wc.generate(rawtext).to_image()
plt.imshow(im)
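
In [ ]:
# As an optional extra (just a sketch), WordCloud can also write the image
# straight to disk with to_file(); the filename here is only an example.
wc.generate(rawtext).to_file('moby-dick-cloud.png')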

In [ ]:
# Now let's take a look at the inaugural address corpus in detail. 
from nltk.corpus import inaugural

In [ ]:
# Now let's set up a conditional frequency distribution for it, 
# pairing each target word with the year of the address it appears in. 
cfd = nltk.ConditionalFreqDist(
           (target, fileid[:4])
           for fileid in inaugural.fileids()
           for w in inaugural.words(fileid)
           for target in ['america', 'citizen']
           if w.lower().startswith(target))
cfd.plot()
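
In [ ]:
# As an optional extra (just a sketch), a ConditionalFreqDist can also print a
# table of counts with tabulate(); the sample years below are only a handful
# of examples.
cfd.tabulate(conditions=['america', 'citizen'],
             samples=['1789', '1861', '1961', '2009'])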

In [ ]:
# Let's play around with the Brown corpus. 
# It's a categorized text corpus. Let's see all the categories: 
nltk.corpus.brown.categories()

In [ ]:
# Now let's create another conditional frequency distribution, 
# this time based on these genres. 
genres = ['adventure', 'romance', 'science_fiction']
words = ['can', 'could', 'may', 'might', 'must', 'will']
cfdist = nltk.ConditionalFreqDist(
              (genre, word)
              for genre in genres
              for word in nltk.corpus.brown.words(categories=genre)
              if word in words)

In [ ]:
cfdist
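
In [ ]:
# Before plotting, we can view the counts as a table by converting the
# distribution into a pandas DataFrame (the same conversion we plot below).
pd.DataFrame(cfdist).T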

In [ ]:
pd.DataFrame(cfdist).T.plot(kind='bar')

In [ ]: