In [1]:
# Importing nltk
import nltk
In [2]:
# Importing inaugural corpus from nltk
from nltk.corpus import inaugural
In [3]:
# List the file ids available in the inaugural corpus; each id names one
# speech (e.g. '2009-Obama.txt', whose first four characters are the year).
inaugural.fileids()
Out[3]:
In [4]:
# Report how many words each inaugural speech contains.
for speech in inaugural.fileids():
    # inaugural.words(speech) returns the sequence of word tokens in that
    # speech; len() of that sequence is the word count.
    words_count = len(inaugural.words(speech))
    print('words -', words_count, '| speech -', speech)
In [5]:
# To find the longest and shortest speeches, pair every speech's word count
# with its file id using a list comprehension. The count comes first in each
# tuple, so max() and min() on the list order the speeches by length.
speech_words = [
    (len(inaugural.words(fileid)), fileid)
    for fileid in inaugural.fileids()
]
In [6]:
# Tuples compare element by element, so the word count decides the ordering
# (the file id only breaks ties between speeches of equal length).
max_speech_words = max(speech_words)
min_speech_words = min(speech_words)
In [7]:
# Show the longest and shortest speeches; each value is a (word count, file id) tuple.
print('max -', max_speech_words, '| min -', min_speech_words)
In [8]:
# Average words per sentence = number of words / number of sentences.
# inaugural.sents(fileid) gives the sentences of a speech, so its len() is the
# sentence count. Collect (word count, sentence count, file id) per speech.
speech_words_and_sentences_count = [
    (len(inaugural.words(fileid)), len(inaugural.sents(fileid)), fileid)
    for fileid in inaugural.fileids()
]
In [9]:
# Report the average number of words per sentence for each speech.
for words_count, sents_count, speech in speech_words_and_sentences_count:
    # `/` is true division in Python 3, so the average keeps its fractional part.
    print('Avg -', words_count / sents_count, '| speech -', speech)
In [10]:
# the best way to interpret any data is to visualize it
# we use pandas dataframes to visualize the data
import pandas as pd
In [11]:
# Build a two-column DataFrame with one row per speech: [year, average].
# - A file id looks like '2009-Obama.txt', so its first four characters,
#   converted with int(), give the year (2009).
# - The average words per sentence is truncated to an integer with int().
# The rows come from speech_words_and_sentences_count, which holds
# (words_count, sents_count, file id) for every speech.
data = pd.DataFrame(
    [
        [int(fileid[:4]), int(n_words / n_sents)]
        for n_words, n_sents, fileid in speech_words_and_sentences_count
    ]
)
In [12]:
# Preview the first 10 rows of the dataframe (year vs. average); the columns
# still carry the default integer labels 0 and 1 at this point.
data.head(10)
Out[12]:
In [13]:
# Now changing the column labels to 'Year' and 'Average WPS'
# (WPS = words per sentence, i.e. words_count / sents_count).
data.columns = ['Year', 'Average WPS']
In [14]:
# Preview the first 10 rows again to confirm the column labels changed
# from 0, 1 to Year, Average WPS.
data.head(10)
Out[14]:
In [17]:
import matplotlib
%matplotlib inline
# Plot the average words per sentence against the inauguration year.
# x and y are named explicitly so the chart does not silently pick up any
# other column added to `data` later, and the title lets the figure stand on
# its own when the notebook is skimmed.
data.plot(x='Year', y='Average WPS', figsize=(15, 5),
          title='Average words per sentence in US inaugural addresses')
Out[17]:
In [ ]: