In [1]:
# Importing nltk
import nltk

In [2]:
# Importing inaugural corpus from nltk
from nltk.corpus import inaugural

In [3]:
# List the file identifiers available in the inaugural corpus
# (the output below shows one '<year>-<president>.txt' entry per address)
inaugural.fileids()


Out[3]:
['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1977-Carter.txt',
 '1981-Reagan.txt',
 '1985-Reagan.txt',
 '1989-Bush.txt',
 '1993-Clinton.txt',
 '1997-Clinton.txt',
 '2001-Bush.txt',
 '2005-Bush.txt',
 '2009-Obama.txt']

In [4]:
# Report how many word tokens each inaugural address contains.
# NOTE(review): the count is whatever NLTK's corpus tokenizer yields, which
# presumably includes punctuation marks as separate tokens -- confirm.
for speech in inaugural.fileids():
    # inaugural.words(speech) returns the speech's tokens (not a number);
    # taking len() of that sequence gives the count.
    speech_tokens = inaugural.words(speech)
    words_count = len(speech_tokens)
    print('words -', words_count, '| speech -', speech)


words - 1538 | speech - 1789-Washington.txt
words - 147 | speech - 1793-Washington.txt
words - 2585 | speech - 1797-Adams.txt
words - 1935 | speech - 1801-Jefferson.txt
words - 2384 | speech - 1805-Jefferson.txt
words - 1265 | speech - 1809-Madison.txt
words - 1304 | speech - 1813-Madison.txt
words - 3693 | speech - 1817-Monroe.txt
words - 4909 | speech - 1821-Monroe.txt
words - 3150 | speech - 1825-Adams.txt
words - 1208 | speech - 1829-Jackson.txt
words - 1267 | speech - 1833-Jackson.txt
words - 4171 | speech - 1837-VanBuren.txt
words - 9165 | speech - 1841-Harrison.txt
words - 5196 | speech - 1845-Polk.txt
words - 1182 | speech - 1849-Taylor.txt
words - 3657 | speech - 1853-Pierce.txt
words - 3098 | speech - 1857-Buchanan.txt
words - 4005 | speech - 1861-Lincoln.txt
words - 785 | speech - 1865-Lincoln.txt
words - 1239 | speech - 1869-Grant.txt
words - 1478 | speech - 1873-Grant.txt
words - 2724 | speech - 1877-Hayes.txt
words - 3239 | speech - 1881-Garfield.txt
words - 1828 | speech - 1885-Cleveland.txt
words - 4750 | speech - 1889-Harrison.txt
words - 2153 | speech - 1893-Cleveland.txt
words - 4371 | speech - 1897-McKinley.txt
words - 2450 | speech - 1901-McKinley.txt
words - 1091 | speech - 1905-Roosevelt.txt
words - 5846 | speech - 1909-Taft.txt
words - 1905 | speech - 1913-Wilson.txt
words - 1656 | speech - 1917-Wilson.txt
words - 3756 | speech - 1921-Harding.txt
words - 4442 | speech - 1925-Coolidge.txt
words - 3890 | speech - 1929-Hoover.txt
words - 2063 | speech - 1933-Roosevelt.txt
words - 2019 | speech - 1937-Roosevelt.txt
words - 1536 | speech - 1941-Roosevelt.txt
words - 637 | speech - 1945-Roosevelt.txt
words - 2528 | speech - 1949-Truman.txt
words - 2775 | speech - 1953-Eisenhower.txt
words - 1917 | speech - 1957-Eisenhower.txt
words - 1546 | speech - 1961-Kennedy.txt
words - 1715 | speech - 1965-Johnson.txt
words - 2425 | speech - 1969-Nixon.txt
words - 2028 | speech - 1973-Nixon.txt
words - 1380 | speech - 1977-Carter.txt
words - 2801 | speech - 1981-Reagan.txt
words - 2946 | speech - 1985-Reagan.txt
words - 2713 | speech - 1989-Bush.txt
words - 1855 | speech - 1993-Clinton.txt
words - 2462 | speech - 1997-Clinton.txt
words - 1825 | speech - 2001-Bush.txt
words - 2376 | speech - 2005-Bush.txt
words - 2726 | speech - 2009-Obama.txt

In [5]:
# Find the longest and shortest speeches by word count.
# Build a list of (word_count, fileid) tuples with a list comprehension;
# putting the count first means max() / min() on the list order by length.
speech_words = [
    (len(inaugural.words(fileid)), fileid)
    for fileid in inaugural.fileids()
]

In [6]:
# Tuples compare element by element, so max()/min() select the entries with
# the largest/smallest word count (the fileid only breaks ties).
max_speech_words = max(speech_words)
min_speech_words = min(speech_words)

In [7]:
# Show the longest and shortest speeches as (word_count, fileid) tuples
print('max -', max_speech_words, '| min -', min_speech_words)


max - (9165, '1841-Harrison.txt') | min - (147, '1793-Washington.txt')

In [8]:
# Gather what is needed for the average number of words per sentence:
#   average_num_of_words_per_sentence = number_of_words / number_of_sentences
# inaugural.sents(fileid) returns the speech's sentences, so its len() is the
# sentence count. Each entry is (word_count, sentence_count, fileid).
speech_words_and_sentences_count = [
    (len(inaugural.words(fileid)), len(inaugural.sents(fileid)), fileid)
    for fileid in inaugural.fileids()
]

In [9]:
# Walk the (word_count, sentence_count, fileid) tuples and report each
# speech's average number of words per sentence.
for words_count, sents_count, speech in speech_words_and_sentences_count:
    words_per_sentence = words_count / sents_count  # true division -> float
    print('Avg -', words_per_sentence, '| speech -', speech)


Avg - 64.08333333333333 | speech - 1789-Washington.txt
Avg - 36.75 | speech - 1793-Washington.txt
Avg - 69.86486486486487 | speech - 1797-Adams.txt
Avg - 46.07142857142857 | speech - 1801-Jefferson.txt
Avg - 52.977777777777774 | speech - 1805-Jefferson.txt
Avg - 60.23809523809524 | speech - 1809-Madison.txt
Avg - 39.515151515151516 | speech - 1813-Madison.txt
Avg - 30.270491803278688 | speech - 1817-Monroe.txt
Avg - 38.054263565891475 | speech - 1821-Monroe.txt
Avg - 42.567567567567565 | speech - 1825-Adams.txt
Avg - 48.32 | speech - 1829-Jackson.txt
Avg - 42.233333333333334 | speech - 1833-Jackson.txt
Avg - 43.90526315789474 | speech - 1837-VanBuren.txt
Avg - 43.642857142857146 | speech - 1841-Harrison.txt
Avg - 33.96078431372549 | speech - 1845-Polk.txt
Avg - 53.72727272727273 | speech - 1849-Taylor.txt
Avg - 35.16346153846154 | speech - 1853-Pierce.txt
Avg - 34.80898876404494 | speech - 1857-Buchanan.txt
Avg - 29.02173913043478 | speech - 1861-Lincoln.txt
Avg - 29.074074074074073 | speech - 1865-Lincoln.txt
Avg - 30.21951219512195 | speech - 1869-Grant.txt
Avg - 33.59090909090909 | speech - 1873-Grant.txt
Avg - 46.16949152542373 | speech - 1877-Hayes.txt
Avg - 28.919642857142858 | speech - 1881-Garfield.txt
Avg - 41.54545454545455 | speech - 1885-Cleveland.txt
Avg - 30.254777070063696 | speech - 1889-Harrison.txt
Avg - 37.12068965517241 | speech - 1893-Cleveland.txt
Avg - 33.62307692307692 | speech - 1897-McKinley.txt
Avg - 24.5 | speech - 1901-McKinley.txt
Avg - 33.06060606060606 | speech - 1905-Roosevelt.txt
Avg - 36.76729559748428 | speech - 1909-Taft.txt
Avg - 28.014705882352942 | speech - 1913-Wilson.txt
Avg - 27.6 | speech - 1917-Wilson.txt
Avg - 25.20805369127517 | speech - 1921-Harding.txt
Avg - 22.548223350253807 | speech - 1925-Coolidge.txt
Avg - 24.620253164556964 | speech - 1929-Hoover.txt
Avg - 24.270588235294117 | speech - 1933-Roosevelt.txt
Avg - 21.03125 | speech - 1937-Roosevelt.txt
Avg - 22.58823529411765 | speech - 1941-Roosevelt.txt
Avg - 24.5 | speech - 1945-Roosevelt.txt
Avg - 21.79310344827586 | speech - 1949-Truman.txt
Avg - 22.5609756097561 | speech - 1953-Eisenhower.txt
Avg - 20.83695652173913 | speech - 1957-Eisenhower.txt
Avg - 29.73076923076923 | speech - 1961-Kennedy.txt
Avg - 18.24468085106383 | speech - 1965-Johnson.txt
Avg - 22.87735849056604 | speech - 1969-Nixon.txt
Avg - 29.391304347826086 | speech - 1973-Nixon.txt
Avg - 26.037735849056602 | speech - 1977-Carter.txt
Avg - 22.055118110236222 | speech - 1981-Reagan.txt
Avg - 23.38095238095238 | speech - 1985-Reagan.txt
Avg - 18.71034482758621 | speech - 1989-Bush.txt
Avg - 22.901234567901234 | speech - 1993-Clinton.txt
Avg - 21.982142857142858 | speech - 1997-Clinton.txt
Avg - 18.814432989690722 | speech - 2001-Bush.txt
Avg - 25.010526315789473 | speech - 2005-Bush.txt
Avg - 24.339285714285715 | speech - 2009-Obama.txt

In [10]:
# the best way to interpret any data is to visualize it
# we use pandas dataframes to visualize the data
import pandas as pd

In [11]:
# Build a two-column DataFrame: inauguration year and average words per sentence.
# The year is the first four characters of the fileid,
# e.g. '2009-Obama.txt'[:4] -> '2009'.
# Rows are derived from speech_words_and_sentences_count, whose entries are
# (words_count, sents_count, fileid); int() truncates the average to a whole number.
data = pd.DataFrame([
    [int(fileid[:4]), int(words_count / sents_count)]
    for words_count, sents_count, fileid in speech_words_and_sentences_count
])

In [12]:
# Let's look at the first 10 rows of the dataframe (year vs. average);
# the columns still carry the default integer labels 0 and 1
data.head(10)


Out[12]:
0 1
0 1789 64
1 1793 36
2 1797 69
3 1801 46
4 1805 52
5 1809 60
6 1813 39
7 1817 30
8 1821 38
9 1825 42

In [13]:
# Now changing the column labels to Year and Average WPS (words per sentence)
data.columns = ['Year', 'Average WPS']

In [14]:
# Preview again to confirm the column labels changed from 0, 1 to Year, Average WPS
data.head(10)


Out[14]:
Year Average WPS
0 1789 64
1 1793 36
2 1797 69
3 1801 46
4 1805 52
5 1809 60
6 1813 39
7 1817 30
8 1821 38
9 1825 42

In [17]:
import matplotlib
%matplotlib inline
data.plot('Year', figsize=(15, 5))


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0xd79b626e48>

In [ ]: