In [ ]:
# Get the Natural Language Toolkit (NLTK)
import nltk

# Get the data science package Pandas
import pandas as pd

# Get the library matplotlib for making pretty charts
import matplotlib.pyplot as plt

# Make plots appear here in this notebook
%matplotlib inline

# This just makes the plots bigger, so that we can see them more easily. 
plt.rcParams['figure.figsize'] = (12,4)

# Get all the example books from the NLTK textbook
from nltk.book import *

In [ ]:
# Let's explore these texts a little. 
# There are lots of things we can do with these texts. 
# To see a list, type text1. and press <Tab>
text1.collocations()
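
In [ ]:
# As a quick aside (just a sketch), here are a couple of the other methods that
# <Tab>-completion reveals -- 'monstrous' and 'whale' are only example words:
text1.similar('monstrous')
text1.count('whale')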

In [ ]:
# But what if we get tired of doing that for each text, and want to do it with all of them? 
# Put the texts into a list.
alltexts = [text1, text2, text3, text4, text5, text6, text7, text8, text9]

In [ ]:
# Let's look at it to make sure it's all there. 
alltexts

In [ ]:
for text in alltexts: 
    text.collocations()
    print('---')

In [ ]:
text6.concordance('shrubbery')

In [ ]:
text1.dispersion_plot(['Ahab', 'Ishmael', 'whale'])

In [ ]:
text2.dispersion_plot(['Elinor', 'Marianne', 'Edward', 'Willoughby'])

In [ ]:
text6.dispersion_plot(['Ni', 'shrubbery'])

In [ ]:
# Let's count the words in a text
len(text1)

In [ ]:
# Put the texts and their wordcounts into a lookup table
lengths = {text.name: len(text) for text in alltexts}

In [ ]:
lengths

In [ ]:
pd.Series(lengths)

In [ ]:
pd.Series(lengths).plot(kind='bar')
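
In [ ]:
# As an optional extra: sorting the Series first can make the bar chart easier
# to read. sort_values() is a standard pandas Series method.
pd.Series(lengths).sort_values().plot(kind='bar')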

In [ ]:
# That in itself is not very interesting. 
# So let's see if we can not only count the words, but count the vocabulary
# of a text.
# To do that, we can use `set()`, which keeps only one copy of each distinct word. 
porky_sentence = "the the the the the that's all folks"
porky_words = porky_sentence.split()
porky_words

In [ ]:
# We can count the words in the sentence easily: 
len(porky_words)

In [ ]:
# To list each word only once, ignoring repeats, we can use the function set(). 
set(porky_words)

In [ ]:
# So if we count this set, we can determine the vocabulary of a text. 
len(set(porky_words))

In [ ]:
# Let's see if we can find the vocabulary of Moby Dick.
len(set(text1))

In [ ]:
# Pretty big, but then again, Moby Dick is kind of a long novel. 
# We can adjust for length by dividing the total word count by the vocabulary size, 
# which gives the average number of times each word is used: 
len(text1) / len(set(text1))

In [ ]:
# This would get tedious if we did this for every text, 
# so let's write a function!
def vocab(text): 
    # Total words divided by distinct words: the average number of times 
    # each word gets used. 
    return len(text) / len(set(text))

In [ ]:
vocab(porky_words)

In [ ]:
# Let's go through each text, compute this measure for it, and put the results in a table. 
vocabularies = {text.name: vocab(text) for text in alltexts}

In [ ]:
# Let's put that table into Pandas so we can see it better: 
pd.Series(vocabularies)

In [ ]:
# Now let's plot that. 
pd.Series(vocabularies).plot(kind='bar')

In [ ]:
# OK, now let's make a word cloud
from wordcloud import WordCloud

In [ ]:
rawtext = ' '.join(text1.tokens) # Stitch it back together. 
wc = WordCloud(width=800, height=600, background_color='white')
im = wc.generate(rawtext).to_image()
plt.imshow(im)
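
In [ ]:
# As an optional extra (just a sketch), WordCloud can also write the image
# straight to disk with to_file(); the filename here is only an example.
wc.generate(rawtext).to_file('moby-dick-cloud.png')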

In [ ]:
# Now let's take a look at the inaugural address corpus in detail. 
from nltk.corpus import inaugural

In [ ]:
# Now let's set up a conditional frequency distribution for it, 
# pairing each target word with the year of the address it appears in. 
cfd = nltk.ConditionalFreqDist(
           (target, fileid[:4])
           for fileid in inaugural.fileids()
           for w in inaugural.words(fileid)
           for target in ['america', 'citizen']
           if w.lower().startswith(target))
cfd.plot()
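
In [ ]:
# As an optional extra (just a sketch), a ConditionalFreqDist can also print a
# table of counts with tabulate(); the sample years below are only a handful
# of examples.
cfd.tabulate(conditions=['america', 'citizen'],
             samples=['1789', '1861', '1961', '2009'])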

In [ ]:
# Let's play around with the Brown corpus. 
# It's a categorized text corpus. Let's see all the categories: 
nltk.corpus.brown.categories()

In [ ]:
# Now let's create another conditional frequency distribution, 
# this time based on these genres. 
genres = ['adventure', 'romance', 'science_fiction']
words = ['can', 'could', 'may', 'might', 'must', 'will']
cfdist = nltk.ConditionalFreqDist(
              (genre, word)
              for genre in genres
              for word in nltk.corpus.brown.words(categories=genre)
              if word in words)

In [ ]:
cfdist
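
In [ ]:
# Before plotting, we can view the counts as a table by converting the
# distribution into a pandas DataFrame (the same conversion we plot below).
pd.DataFrame(cfdist).T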

In [ ]:
pd.DataFrame(cfdist).T.plot(kind='bar')

In [ ]: