In [1]:
# Get the Natural Language Processing Toolkit
import nltk
# Get the data science package Pandas
import pandas as pd
# Get the library matplotlib for making pretty charts
import matplotlib as plt
# Make plots appear here in this notebook
%matplotlib inline
# This just makes the plot size bigger, so that we can see it easier.
plt.rcParams['figure.figsize'] = (12,4)
# Get all the example books from the NLTK textbook
from nltk.book import *
In [9]:
# Let's explore these texts a little.
# There are lots of things we can do with these texts.
# To see a list, type text1. and press <Tab>
# collocations() prints word pairs that occur together unusually often;
# it prints to stdout rather than returning a value.
text1.collocations()
In [10]:
# But what if we get tired of doing that for each text, and want to do it with all of them?
# Put the texts into a list.
# text1 ... text9 come from the `from nltk.book import *` in the first cell.
alltexts = [text1, text2, text3, text4, text5, text6, text7, text8, text9]
In [11]:
# Let's look at it to make sure it's all there.
alltexts
Out[11]:
In [12]:
# Print the collocations for every text, with a divider between them.
# NOTE(review): the loop body's indentation was lost in this export — the
# two lines below should be indented under the for-statement.
for text in alltexts:
text.collocations()
print('---')
In [13]:
# concordance() prints every occurrence of the word together with its
# surrounding context, one line per hit.
text6.concordance('shrubbery')
In [14]:
# dispersion_plot() draws where in the text each word appears, from the
# beginning (left) to the end (right) — one row per word.
text1.dispersion_plot(['Ahab', 'Ishmael', 'whale'])
In [15]:
# Track where each of these character names appears across the novel
# (presumably text2 is Austen's Sense and Sensibility, per nltk.book).
text2.dispersion_plot(['Elinor', 'Marianne', 'Edward', 'Willoughby'])
In [16]:
# Same idea for text6 (the 'shrubbery'/'Ni' words suggest Monty Python).
text6.dispersion_plot(['Ni', 'shrubbery'])
In [17]:
# Let's count the words in a text
# len() on a text gives the total number of tokens (including punctuation).
len(text1)
Out[17]:
In [18]:
# Put the texts and their wordcounts into a lookup table
# Dict comprehension: maps each text's .name attribute to its token count.
lengths = {text.name: len(text) for text in alltexts}
In [19]:
# Display the raw dictionary.
lengths
Out[19]:
In [20]:
# Wrap the dict in a pandas Series for nicer display and easy plotting.
pd.Series(lengths)
Out[20]:
In [21]:
# Bar chart of total token counts, one bar per text.
pd.Series(lengths).plot(kind='bar')
Out[21]:
In [24]:
# That in itself is not very interesting.
# So let's see if we can not only count the words, but count the vocabulary
# of a text.
# To do that, we can use `set()`, which keeps just one copy of each distinct word.
# Porky Pig's sign-off: a tiny sentence with heavy repetition, handy for
# demonstrating the difference between token count and vocabulary size.
porky_sentence = "the the the the the that's all folks"
# Split on whitespace into individual word tokens.
porky_words = list(porky_sentence.split())
# Ending the cell with a bare expression displays the list.
porky_words
Out[24]:
In [25]:
# We can count the words in the sentence easily:
len(porky_words)
Out[25]:
In [26]:
# To count the words, but ignore repeated words, we can use the function set().
# set() keeps one copy of each distinct word (order is not preserved).
set(porky_words)
Out[26]:
In [27]:
# So if we count this set, we can determine the vocabulary of a text.
len(set(porky_words))
Out[27]:
In [28]:
# Let's see if we can find the vocabulary of Moby Dick.
# NOTE: tokens are used as-is, so punctuation and case variants
# ("The" vs "the") count as separate vocabulary items.
len(set(text1))
Out[28]:
In [29]:
# Pretty big, but then again, Moby Dick is kind of a long novel.
# We can adjust for the words by adjusting for the total words:
# total tokens / distinct tokens = average number of uses per distinct word
# (the inverse of the usual "lexical diversity" ratio).
len(text1) / len(set(text1))
Out[29]:
In [30]:
# This would get tedious if we did this for every text,
# so let's write a function!
def vocab(text):
    """Return the average number of uses per distinct word in `text`.

    Computes total tokens divided by vocabulary size — note this is the
    *inverse* of the usual lexical-diversity ratio. Higher values mean
    more repetition; 1.0 means every word occurs exactly once.

    Parameters
    ----------
    text : an NLTK Text or any iterable of word tokens.

    Returns
    -------
    float : len(text) / len(set(text)), or 0.0 for empty input
        (guards against ZeroDivisionError).
    """
    unique_words = set(text)
    if not unique_words:
        # Empty text has no vocabulary at all; avoid dividing by zero.
        return 0.0
    return len(text) / len(unique_words)
In [31]:
# Sanity-check the function on the tiny example:
# 8 tokens / 4 distinct words = 2.0.
vocab(porky_words)
Out[31]:
In [32]:
# Let's go through each text, and get its vocabulary, and put it in a table.
# Same pattern as the word counts earlier: text name -> vocab() score.
vocabularies = {text.name: vocab(text) for text in alltexts}
In [33]:
# Let's put that table into Pandas so we can see it better:
pd.Series(vocabularies)
Out[33]:
In [34]:
# Now let's plot that.
# Taller bars = more repetition (fewer distinct words relative to length).
pd.Series(vocabularies).plot(kind='bar')
Out[34]:
In [35]:
# OK, now let's make a famous wordcloud
# (third-party `wordcloud` package; install with: pip install wordcloud)
from wordcloud import WordCloud
In [45]:
rawtext = ' '.join(text1.tokens) # Stitch it back together.
# Build an 800x600 word-cloud image from the raw text and convert it to
# a PIL image so matplotlib can display it.
wc = WordCloud(width=800, height=600, background_color='white')
im = wc.generate(rawtext).to_image()
# NOTE(review): `plt` is the matplotlib *package* here (see the import cell),
# so pyplot is reached as an attribute — hence plt.pyplot.imshow, not
# plt.imshow. This works because pyplot was imported by %matplotlib inline.
plt.pyplot.imshow(im)
Out[45]:
In [46]:
# Now let's take a look at the inaugural address corpus in detail.
from nltk.corpus import inaugural
In [47]:
# Now let's set up a conditional word frequency distribution for it,
# pairing off a list of words with the list of inaugural addresses.
# For every word in every address, if it starts with 'america' or 'citizen'
# (lowercased first, so "America's" and "Citizens" also match), record a
# (target, year) pair — fileid[:4] takes the first four characters of the
# file id, presumably the year (e.g. '1789') — verify against the corpus.
# NOTE(review): the comprehension's indentation was lost in this export.
cfd = nltk.ConditionalFreqDist(
(target, fileid[:4])
for fileid in inaugural.fileids()
for w in inaugural.words(fileid)
for target in ['america', 'citizen']
if w.lower().startswith(target))
# Plot counts over time, one line per target word.
cfd.plot()
In [48]:
# Let's play around with the Brown corpus.
# It's a categorized text corpus. Let's see all the categories:
nltk.corpus.brown.categories()
Out[48]:
In [50]:
# Now let's create another conditional frequency distribution,
# this time based on these genres.
# Count how often each of these modal verbs appears in each genre.
genres = ['adventure', 'romance', 'science_fiction']
words = ['can', 'could', 'may', 'might', 'must', 'will']
# NOTE(review): the comprehension's indentation was lost in this export.
cfdist = nltk.ConditionalFreqDist(
(genre, word)
for genre in genres
for word in nltk.corpus.brown.words(categories=genre)
if word in words)
In [51]:
# Display the distribution's repr.
cfdist
Out[51]:
In [52]:
# DataFrame(cfdist) makes one column per genre (condition); .T transposes
# so each genre becomes a group of bars, one colored bar per modal verb.
pd.DataFrame(cfdist).T.plot(kind='bar')
Out[52]:
In [ ]: