In [1]:
# Get the Natural Language Toolkit
import nltk

# Get the data science package Pandas
import pandas as pd

# Get the library matplotlib for making pretty charts
import matplotlib.pyplot as plt

# Make plots appear here in this notebook
%matplotlib inline

# This just makes the plots bigger, so they're easier to see.
plt.rcParams['figure.figsize'] = (12,4)

# Get all the example books from the NLTK textbook
from nltk.book import *


*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908

In [9]:
# Let's explore these texts a little. 
# There are lots of things we can do with these Text objects. 
# To see a list of their methods, type text1. and press <Tab>.
text1.collocations()


Sperm Whale; Moby Dick; White Whale; old man; Captain Ahab; sperm
whale; Right Whale; Captain Peleg; New Bedford; Cape Horn; cried Ahab;
years ago; lower jaw; never mind; Father Mapple; cried Stubb; chief
mate; white whale; ivory leg; one hand
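
# Collocations are just one of the many methods Text objects offer.
# For example, similar() shows words that appear in contexts similar
# to a given word (exact output varies by NLTK version):
text1.similar('whale')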

In [10]:
# Doing that for each text one at a time would get tedious. 
# To work with all of them at once, put the texts into a list.
alltexts = [text1, text2, text3, text4, text5, text6, text7, text8, text9]

In [11]:
# Let's look at it to make sure it's all there. 
alltexts


Out[11]:
[<Text: Moby Dick by Herman Melville 1851>,
 <Text: Sense and Sensibility by Jane Austen 1811>,
 <Text: The Book of Genesis>,
 <Text: Inaugural Address Corpus>,
 <Text: Chat Corpus>,
 <Text: Monty Python and the Holy Grail>,
 <Text: Wall Street Journal>,
 <Text: Personals Corpus>,
 <Text: The Man Who Was Thursday by G . K . Chesterton 1908>]

In [12]:
for text in alltexts: 
    text.collocations()
    print('---')


Sperm Whale; Moby Dick; White Whale; old man; Captain Ahab; sperm
whale; Right Whale; Captain Peleg; New Bedford; Cape Horn; cried Ahab;
years ago; lower jaw; never mind; Father Mapple; cried Stubb; chief
mate; white whale; ivory leg; one hand
---
Colonel Brandon; Sir John; Lady Middleton; Miss Dashwood; every thing;
thousand pounds; dare say; Miss Steeles; said Elinor; Miss Steele;
every body; John Dashwood; great deal; Harley Street; Berkeley Street;
Miss Dashwoods; young man; Combe Magna; every day; next morning
---
said unto; pray thee; thou shalt; thou hast; thy seed; years old;
spake unto; thou art; LORD God; every living; God hath; begat sons;
seven years; shalt thou; little ones; living creature; creeping thing;
savoury meat; thirty years; every beast
---
United States; fellow citizens; four years; years ago; Federal
Government; General Government; American people; Vice President; Old
World; Almighty God; Fellow citizens; Chief Magistrate; Chief Justice;
God bless; every citizen; Indian tribes; public debt; one another;
foreign nations; political parties
---
wanna chat; PART JOIN; MODE #14-19teens; JOIN PART; PART PART;
cute.-ass MP3; MP3 player; JOIN JOIN; times .. .; ACTION watches; guys
wanna; song lasts; last night; ACTION sits; -...)...- S.M.R.; Lime
Player; Player 12%; dont know; lez gurls; long time
---
BLACK KNIGHT; clop clop; HEAD KNIGHT; mumble mumble; Holy Grail;
squeak squeak; FRENCH GUARD; saw saw; Sir Robin; Run away; CARTOON
CHARACTER; King Arthur; Iesu domine; Pie Iesu; DEAD PERSON; Round
Table; clap clap; OLD MAN; dramatic chord; dona eis
---
million *U*; New York; billion *U*; Wall Street; program trading; Mrs.
Yeargin; vice president; Stock Exchange; Big Board; Georgia Gulf;
chief executive; Dow Jones; S&P 500; says *T*-1; York Stock; last
year; Sea Containers; South Korea; American Express; San Francisco
---
would like; medium build; social drinker; quiet nights; non smoker;
long term; age open; Would like; easy going; financially secure; fun
times; similar interests; Age open; weekends away; poss rship; well
presented; never married; single mum; permanent relationship; slim
build
---
said Syme; asked Syme; Saffron Park; Comrade Gregory; Leicester
Square; Colonel Ducroix; red hair; old gentleman; could see; Inspector
Ratcliffe; Anarchist Council; blue card; Scotland Yard; dark room;
blue eyes; common sense; straw hat; hundred yards; said Gregory; run
away
---
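
# collocations() prints its results rather than returning them.
# In newer NLTK versions (3.4.5 and later), collocation_list() returns
# them as a list instead (entries may be strings or word pairs,
# depending on the version), which is handier for further processing:
text1.collocation_list()[:5]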

In [13]:
text6.concordance('shrubbery')


Displaying 13 of 13 matches:
want ? HEAD KNIGHT : We want ... a shrubbery ! [ dramatic chord ] ARTHUR : A wh
ase ! No more ! We will find you a shrubbery . HEAD KNIGHT : You must return he
IGHT : You must return here with a shrubbery or else you will never pass throug
d fair , and we will return with a shrubbery . HEAD KNIGHT : One that looks nic
 in this town where we could buy a shrubbery ? [ dramatic chord ] OLD CRONE : W
 do not tell us where we can buy a shrubbery , my friend and I will say ... we 
s of Ni , we have brought you your shrubbery . May we go now ? HEAD KNIGHT : It
o now ? HEAD KNIGHT : It is a good shrubbery . I like the laurels particularly 
irstly , you must find ... another shrubbery ! [ dramatic chord ] ARTHUR : Not 
matic chord ] ARTHUR : Not another shrubbery ! RANDOM : Ni ! HEAD KNIGHT : Then
T : Then , when you have found the shrubbery , you must place it here beside th
you must place it here beside this shrubbery , only slightly higher so you get 
T : Then , when you have found the shrubbery , you must cut down the mightiest 
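
# concordance() also accepts optional width and lines arguments, to
# control how much context is shown and how many matches are printed:
text6.concordance('shrubbery', width=60, lines=5)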

In [14]:
text1.dispersion_plot(['Ahab', 'Ishmael', 'whale'])

[dispersion plot showing where 'Ahab', 'Ishmael', and 'whale' occur across Moby Dick]

In [15]:
text2.dispersion_plot(['Elinor', 'Marianne', 'Edward', 'Willoughby'])

[dispersion plot of 'Elinor', 'Marianne', 'Edward', and 'Willoughby' across Sense and Sensibility]

In [16]:
text6.dispersion_plot(['Ni', 'shrubbery'])

[dispersion plot of 'Ni' and 'shrubbery' across Monty Python and the Holy Grail]

In [17]:
# Let's count the words (tokens) in a text
len(text1)


Out[17]:
260819

In [18]:
# Put the texts and their word counts into a lookup table (a dictionary)
lengths = {text.name: len(text) for text in alltexts}

In [19]:
lengths


Out[19]:
{'Chat Corpus': 45010,
 'Inaugural Address Corpus': 145735,
 'Moby Dick by Herman Melville 1851': 260819,
 'Monty Python and the Holy Grail': 16967,
 'Personals Corpus': 4867,
 'Sense and Sensibility by Jane Austen 1811': 141576,
 'The Book of Genesis': 44764,
 'The Man Who Was Thursday by G . K . Chesterton 1908': 69213,
 'Wall Street Journal': 100676}

In [20]:
pd.Series(lengths)


Out[20]:
Chat Corpus                                             45010
Inaugural Address Corpus                               145735
Moby Dick by Herman Melville 1851                      260819
Monty Python and the Holy Grail                         16967
Personals Corpus                                         4867
Sense and Sensibility by Jane Austen 1811              141576
The Book of Genesis                                     44764
The Man Who Was Thursday by G . K . Chesterton 1908     69213
Wall Street Journal                                    100676
dtype: int64

In [21]:
pd.Series(lengths).plot(kind='bar')


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3beaa5a7b8>
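
# A small optional tweak: sorting the series first makes
# the bar chart easier to compare at a glance.
pd.Series(lengths).sort_values().plot(kind='bar')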

In [24]:
# That in itself is not very interesting. 
# So let's see if we can not only count the words, but also measure
# the vocabulary of a text.
# To do that, we can use set(), which keeps only one copy of each word.
porky_sentence = "the the the the the that's all folks"
porky_words = porky_sentence.split()
porky_words


Out[24]:
['the', 'the', 'the', 'the', 'the', "that's", 'all', 'folks']

In [25]:
# We can count the words in the sentence easily: 
len(porky_words)


Out[25]:
8

In [26]:
# To get the distinct words, ignoring repeats, we can use the built-in function set(). 
set(porky_words)


Out[26]:
{'all', 'folks', "that's", 'the'}
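
# Note that sets are unordered. If you want a stable order
# (say, for display), wrap the set in sorted():
sorted(set(porky_words))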

In [27]:
# So if we count this set, we get the size of a text's vocabulary. 
len(set(porky_words))


Out[27]:
4

In [28]:
# Let's see if we can find the vocabulary of Moby Dick.
len(set(text1))


Out[28]:
19317

In [29]:
# Pretty big, but then again, Moby Dick is kind of a long novel. 
# We can adjust for length by dividing the total word count by the
# vocabulary size, giving the average number of uses per unique word:
len(text1) / len(set(text1))


Out[29]:
13.502044830977896
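
# Note: this is the reciprocal of the "lexical diversity" score used
# in the NLTK book (unique words divided by total words).
# Either direction works, as long as you compare texts consistently.
len(set(text1)) / len(text1)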

In [30]:
# This would get tedious if we did this for every text, 
# so let's write a function!
def vocab(text): 
    """Average number of times each unique word is used."""
    return len(text) / len(set(text))

In [31]:
vocab(porky_words)


Out[31]:
2.0

In [32]:
# Let's go through each text, compute its vocab() score, and put the results in a table. 
vocabularies = {text.name: vocab(text) for text in alltexts}

In [33]:
# Let's put that table into Pandas so we can see it better: 
pd.Series(vocabularies)


Out[33]:
Chat Corpus                                             7.420046
Inaugural Address Corpus                               14.941050
Moby Dick by Herman Melville 1851                      13.502045
Monty Python and the Holy Grail                         7.833333
Personals Corpus                                        4.392599
Sense and Sensibility by Jane Austen 1811              20.719450
The Book of Genesis                                    16.050197
The Man Who Was Thursday by G . K . Chesterton 1908    10.167915
Wall Street Journal                                     8.113798
dtype: float64

In [34]:
# Now let's plot that. 
pd.Series(vocabularies).plot(kind='bar')


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3bead190f0>

In [35]:
# OK, now let's make a word cloud
from wordcloud import WordCloud

In [45]:
rawtext = ' '.join(text1.tokens) # Stitch the tokens back together into one string. 
wc = WordCloud(width=800, height=600, background_color='white')
im = wc.generate(rawtext).to_image()
plt.imshow(im)


/usr/lib/python3.5/site-packages/PIL/ImageDraw.py:99: UserWarning: setfont() is deprecated. Please set the attribute directly instead.
  "Please set the attribute directly instead.")
Out[45]:
<matplotlib.image.AxesImage at 0x7f3beac200b8>
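
# To save the cloud as an image file instead of (or as well as)
# displaying it inline, WordCloud provides to_file().
# Hiding the axes also makes the inline display cleaner.
# (The filename here is just an example.)
plt.imshow(im)
plt.axis('off')
wc.to_file('mobydick.png')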

In [46]:
# Now let's take a look at the inaugural address corpus in detail. 
from nltk.corpus import inaugural

In [47]:
# Now let's set up a conditional frequency distribution for it, 
# pairing each target word with the year of every inaugural address
# in which a word starting with it appears. 
cfd = nltk.ConditionalFreqDist(
           (target, fileid[:4])
           for fileid in inaugural.fileids()
           for w in inaugural.words(fileid)
           for target in ['america', 'citizen']
           if w.lower().startswith(target))
cfd.plot()

[line plot of the counts of 'america' and 'citizen' in each year's inaugural address]

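# The same counts can also be printed as a table instead of plotted.
# tabulate() takes optional conditions= and samples= keyword arguments
# if you want to restrict the (quite wide) output:
cfd.tabulate()
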
In [48]:
# Let's play around with the Brown corpus. 
# It's a categorized text corpus. Let's see all the categories: 
nltk.corpus.brown.categories()


Out[48]:
['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [50]:
# Now let's create another conditional frequency distribution, 
# this time based on these genres. 
genres = ['adventure', 'romance', 'science_fiction']
words = ['can', 'could', 'may', 'might', 'must', 'will']
cfdist = nltk.ConditionalFreqDist(
              (genre, word)
              for genre in genres
              for word in nltk.corpus.brown.words(categories=genre)
              if word in words)

In [51]:
cfdist


Out[51]:
ConditionalFreqDist(nltk.probability.FreqDist,
                    {'adventure': FreqDist({'can': 46,
                               'could': 151,
                               'may': 5,
                               'might': 58,
                               'must': 27,
                               'will': 50}),
                     'romance': FreqDist({'can': 74,
                               'could': 193,
                               'may': 11,
                               'might': 51,
                               'must': 45,
                               'will': 43}),
                     'science_fiction': FreqDist({'can': 16,
                               'could': 49,
                               'may': 4,
                               'might': 12,
                               'must': 8,
                               'will': 16})})

In [52]:
pd.DataFrame(cfdist).T.plot(kind='bar')


Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3be80d8668>
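
# One caveat: the genres differ in length, so raw counts are not
# directly comparable. A quick normalization (a sketch, not part of
# the original analysis) divides each genre's counts by its size,
# giving a frequency per 1,000 words:
sizes = pd.Series({genre: len(nltk.corpus.brown.words(categories=genre))
                   for genre in genres})
(pd.DataFrame(cfdist).T.div(sizes, axis=0) * 1000).plot(kind='bar')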

In [ ]: