notebook.community

Edit and run



In [1]:

    
# Importing nltk
import nltk



In [2]:

    
# Importing inaugural corpus from nltk
from nltk.corpus import inaugural



In [3]:

    
# List the file identifiers available in the inaugural corpus (one per speech).
# Each id has the form '<year>-<president>.txt', e.g. '1789-Washington.txt'.
inaugural.fileids()









    Out[3]:





['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1977-Carter.txt',
 '1981-Reagan.txt',
 '1985-Reagan.txt',
 '1989-Bush.txt',
 '1993-Clinton.txt',
 '1997-Clinton.txt',
 '2001-Bush.txt',
 '2005-Bush.txt',
 '2009-Obama.txt']



In [4]:

    
# Report the size of every inaugural address, one line per speech.
for speech in inaugural.fileids():
    # inaugural.words(speech) is the sequence of tokens in that speech;
    # its length is what this notebook treats as the speech's word count.
    speech_tokens = inaugural.words(speech)
    words_count = len(speech_tokens)
    print('words -', words_count, '| speech -', speech)









    



words - 1538 | speech - 1789-Washington.txt
words - 147 | speech - 1793-Washington.txt
words - 2585 | speech - 1797-Adams.txt
words - 1935 | speech - 1801-Jefferson.txt
words - 2384 | speech - 1805-Jefferson.txt
words - 1265 | speech - 1809-Madison.txt
words - 1304 | speech - 1813-Madison.txt
words - 3693 | speech - 1817-Monroe.txt
words - 4909 | speech - 1821-Monroe.txt
words - 3150 | speech - 1825-Adams.txt
words - 1208 | speech - 1829-Jackson.txt
words - 1267 | speech - 1833-Jackson.txt
words - 4171 | speech - 1837-VanBuren.txt
words - 9165 | speech - 1841-Harrison.txt
words - 5196 | speech - 1845-Polk.txt
words - 1182 | speech - 1849-Taylor.txt
words - 3657 | speech - 1853-Pierce.txt
words - 3098 | speech - 1857-Buchanan.txt
words - 4005 | speech - 1861-Lincoln.txt
words - 785 | speech - 1865-Lincoln.txt
words - 1239 | speech - 1869-Grant.txt
words - 1478 | speech - 1873-Grant.txt
words - 2724 | speech - 1877-Hayes.txt
words - 3239 | speech - 1881-Garfield.txt
words - 1828 | speech - 1885-Cleveland.txt
words - 4750 | speech - 1889-Harrison.txt
words - 2153 | speech - 1893-Cleveland.txt
words - 4371 | speech - 1897-McKinley.txt
words - 2450 | speech - 1901-McKinley.txt
words - 1091 | speech - 1905-Roosevelt.txt
words - 5846 | speech - 1909-Taft.txt
words - 1905 | speech - 1913-Wilson.txt
words - 1656 | speech - 1917-Wilson.txt
words - 3756 | speech - 1921-Harding.txt
words - 4442 | speech - 1925-Coolidge.txt
words - 3890 | speech - 1929-Hoover.txt
words - 2063 | speech - 1933-Roosevelt.txt
words - 2019 | speech - 1937-Roosevelt.txt
words - 1536 | speech - 1941-Roosevelt.txt
words - 637 | speech - 1945-Roosevelt.txt
words - 2528 | speech - 1949-Truman.txt
words - 2775 | speech - 1953-Eisenhower.txt
words - 1917 | speech - 1957-Eisenhower.txt
words - 1546 | speech - 1961-Kennedy.txt
words - 1715 | speech - 1965-Johnson.txt
words - 2425 | speech - 1969-Nixon.txt
words - 2028 | speech - 1973-Nixon.txt
words - 1380 | speech - 1977-Carter.txt
words - 2801 | speech - 1981-Reagan.txt
words - 2946 | speech - 1985-Reagan.txt
words - 2713 | speech - 1989-Bush.txt
words - 1855 | speech - 1993-Clinton.txt
words - 2462 | speech - 1997-Clinton.txt
words - 1825 | speech - 2001-Bush.txt
words - 2376 | speech - 2005-Bush.txt
words - 2726 | speech - 2009-Obama.txt



In [5]:

    
# Next goal: find the longest and the shortest speech.
# Pair every speech with its word count as a (word_count, fileid) tuple; tuples
# compare element by element, so max() and min() on this list rank by word count.
# This is the same counting as the loop above, written as a list comprehension.
speech_words = [
    (len(inaugural.words(fileid)), fileid)
    for fileid in inaugural.fileids()
]



In [6]:

    
# speech_words holds (word_count, fileid) tuples, which are ordered by word count
# first (the fileid only breaks ties), so these are the longest and shortest speech.
max_speech_words = max(speech_words)
min_speech_words = min(speech_words)



In [7]:

    
# Show the longest and the shortest speech; each value is a (word_count, fileid) tuple.
print('max -', max_speech_words, '| min -', min_speech_words)









    



max - (9165, '1841-Harrison.txt') | min - (147, '1793-Washington.txt')



In [8]:

    
# Next: the average number of words per sentence in each speech, i.e.
#   average_num_of_words_per_sentence = number_of_words / number_of_sentences
# inaugural.sents(fileid) provides the sentences, so its length is the sentence count.
# Each entry below is a (words_count, sents_count, fileid) tuple.
speech_words_and_sentences_count = [
    (len(inaugural.words(fileid)), len(inaugural.sents(fileid)), fileid)
    for fileid in inaugural.fileids()
]



In [9]:

    
# Walk the (words_count, sents_count, speech) tuples and report the average
# sentence length of each speech.
for words_count, sents_count, speech in speech_words_and_sentences_count:
    words_per_sentence = words_count / sents_count
    print('Avg -', words_per_sentence, '| speech -', speech)









    



Avg - 64.08333333333333 | speech - 1789-Washington.txt
Avg - 36.75 | speech - 1793-Washington.txt
Avg - 69.86486486486487 | speech - 1797-Adams.txt
Avg - 46.07142857142857 | speech - 1801-Jefferson.txt
Avg - 52.977777777777774 | speech - 1805-Jefferson.txt
Avg - 60.23809523809524 | speech - 1809-Madison.txt
Avg - 39.515151515151516 | speech - 1813-Madison.txt
Avg - 30.270491803278688 | speech - 1817-Monroe.txt
Avg - 38.054263565891475 | speech - 1821-Monroe.txt
Avg - 42.567567567567565 | speech - 1825-Adams.txt
Avg - 48.32 | speech - 1829-Jackson.txt
Avg - 42.233333333333334 | speech - 1833-Jackson.txt
Avg - 43.90526315789474 | speech - 1837-VanBuren.txt
Avg - 43.642857142857146 | speech - 1841-Harrison.txt
Avg - 33.96078431372549 | speech - 1845-Polk.txt
Avg - 53.72727272727273 | speech - 1849-Taylor.txt
Avg - 35.16346153846154 | speech - 1853-Pierce.txt
Avg - 34.80898876404494 | speech - 1857-Buchanan.txt
Avg - 29.02173913043478 | speech - 1861-Lincoln.txt
Avg - 29.074074074074073 | speech - 1865-Lincoln.txt
Avg - 30.21951219512195 | speech - 1869-Grant.txt
Avg - 33.59090909090909 | speech - 1873-Grant.txt
Avg - 46.16949152542373 | speech - 1877-Hayes.txt
Avg - 28.919642857142858 | speech - 1881-Garfield.txt
Avg - 41.54545454545455 | speech - 1885-Cleveland.txt
Avg - 30.254777070063696 | speech - 1889-Harrison.txt
Avg - 37.12068965517241 | speech - 1893-Cleveland.txt
Avg - 33.62307692307692 | speech - 1897-McKinley.txt
Avg - 24.5 | speech - 1901-McKinley.txt
Avg - 33.06060606060606 | speech - 1905-Roosevelt.txt
Avg - 36.76729559748428 | speech - 1909-Taft.txt
Avg - 28.014705882352942 | speech - 1913-Wilson.txt
Avg - 27.6 | speech - 1917-Wilson.txt
Avg - 25.20805369127517 | speech - 1921-Harding.txt
Avg - 22.548223350253807 | speech - 1925-Coolidge.txt
Avg - 24.620253164556964 | speech - 1929-Hoover.txt
Avg - 24.270588235294117 | speech - 1933-Roosevelt.txt
Avg - 21.03125 | speech - 1937-Roosevelt.txt
Avg - 22.58823529411765 | speech - 1941-Roosevelt.txt
Avg - 24.5 | speech - 1945-Roosevelt.txt
Avg - 21.79310344827586 | speech - 1949-Truman.txt
Avg - 22.5609756097561 | speech - 1953-Eisenhower.txt
Avg - 20.83695652173913 | speech - 1957-Eisenhower.txt
Avg - 29.73076923076923 | speech - 1961-Kennedy.txt
Avg - 18.24468085106383 | speech - 1965-Johnson.txt
Avg - 22.87735849056604 | speech - 1969-Nixon.txt
Avg - 29.391304347826086 | speech - 1973-Nixon.txt
Avg - 26.037735849056602 | speech - 1977-Carter.txt
Avg - 22.055118110236222 | speech - 1981-Reagan.txt
Avg - 23.38095238095238 | speech - 1985-Reagan.txt
Avg - 18.71034482758621 | speech - 1989-Bush.txt
Avg - 22.901234567901234 | speech - 1993-Clinton.txt
Avg - 21.982142857142858 | speech - 1997-Clinton.txt
Avg - 18.814432989690722 | speech - 2001-Bush.txt
Avg - 25.010526315789473 | speech - 2005-Bush.txt
Avg - 24.339285714285715 | speech - 2009-Obama.txt



In [10]:

    
# the best way to interpret any data is to visualize it
# we use pandas dataframes to visualize the data
import pandas as pd



In [11]:

    
# Build one [year, average] row per speech from speech_words_and_sentences_count,
# whose entries are (words_count, sents_count, speech) tuples.
# A fileid such as '2009-Obama.txt' starts with the year, so speech[:4] -> '2009'.
# int() truncates the average words per sentence to a whole number.
year_and_average_rows = [
    [int(speech[:4]), int(words_count / sents_count)]
    for words_count, sents_count, speech in speech_words_and_sentences_count
]
# No column names are passed, so the columns get the default labels 0 and 1.
data = pd.DataFrame(year_and_average_rows)



In [12]:

    
# Let's see the first 10 rows of the dataframe: year vs. average words per sentence.
# The columns still carry the default labels 0 and 1 at this point.
data.head(10)



In [13]:

    
# Now changing the column labels to Year and Average WPS (words per sentence)
data.columns = ['Year', 'Average WPS']



In [14]:

    
# Now let's check that the column names changed from 0, 1 to Year, Average WPS
data.head(10)









    Out[14]:







  
    
      
      Year
      Average WPS
    
  
  
    
      0
      1789
      64
    
    
      1
      1793
      36
    
    
      2
      1797
      69
    
    
      3
      1801
      46
    
    
      4
      1805
      52
    
    
      5
      1809
      60
    
    
      6
      1813
      39
    
    
      7
      1817
      30
    
    
      8
      1821
      38
    
    
      9
      1825
      42



In [17]:

    
import matplotlib
%matplotlib inline
# Plot the average sentence length against the year of the speech.
# 'Year' goes on the x-axis; every remaining column (here only 'Average WPS',
# i.e. words per sentence) is drawn as a line.
# The title and y-axis label let the figure stand on its own, and the trailing
# ';' keeps the Axes/Text repr out of the cell output.
ax = data.plot(x='Year', figsize=(15, 5),
               title='Average words per sentence in inaugural speeches, by year')
ax.set_ylabel('Average words per sentence');









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0xd79b626e48>



In [ ]:

	0	1
0	1789	64
1	1793	36
2	1797	69
3	1801	46
4	1805	52
5	1809	60
6	1813	39
7	1817	30
8	1821	38
9	1825	42