In [1]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import seaborn as sns
from nltk.book import text7 as text
In [2]:
# Report how many items the loaded `text` object holds.
n_words = len(text)
print('Number of words', n_words)
In [3]:
# Peek at the first ten items of the text.
text[:10]
Out[3]:
As can be seen from the output above, the text is simply a list of words. Let's count the number of words and then concatenate them to count the number of actual letters.
In [4]:
# Word count: the number of items in `text`.
n_words = len(text)
print('Number of words', n_words)
In [5]:
# Glue all the words into one string, separated by single spaces, so the
# text can be processed character by character.
concatenated_text = str.join(' ', text)
In [6]:
# Peek at the first 50 characters of the joined string. Joining the words on
# a single space is a crude way to rebuild the text.
concatenated_text[:50]
Out[6]:
In [7]:
# Iterating over a string yields its characters, so this counts how often
# each character (spaces included) occurs.
distribution = nltk.FreqDist(samples=concatenated_text)
In [8]:
# Plot the character frequencies. The title lets the figure stand on its own
# when the notebook is skimmed; the trailing `;` hides the return-value repr.
distribution.plot(title='Character frequencies (original case)');
In [9]:
# Print a compact text summary of the character counts.
print(distribution.pformat())
We see that there are a lot of strange characters. We are going to convert all the letters to lowercase and then compute the distribution again.
In [11]:
# Lowercase the text one character at a time; the result is a list of
# single characters, not a string.
text_lowercase = list(map(str.lower, concatenated_text))
In [12]:
# Character counts after folding everything to lowercase.
distribution_low = nltk.FreqDist(samples=text_lowercase)
In [13]:
# Print a compact text summary of the lowercase character counts.
print(distribution_low.pformat())
In [14]:
# seaborn is imported with the rest of the notebook's dependencies in the
# first cell rather than in the middle of the analysis.
# The title lets the figure stand on its own; `;` hides the return-value repr.
distribution_low.plot(title='Character frequencies (lowercased)');
In [15]:
# List the distinct characters that remain after lowercasing.
distinct_symbols = distribution_low.keys()
distinct_symbols
Out[15]:
In [28]:
# Pairs of adjacent characters (n-grams with n = 2). The result is a lazy
# iterable, so it can be consumed only once.
bigrams = nltk.ngrams(text_lowercase, 2)
In [29]:
# Count how often each character pair occurs (this consumes `bigrams`).
bigrams_freq = nltk.FreqDist(samples=bigrams)
However, the most frequent pairs in this distribution are the ones that contain a whitespace character.
In [30]:
# The ten most frequent character pairs with their counts.
top_pairs = bigrams_freq.most_common(10)
top_pairs
Out[30]:
In [37]:
# Plot only the first 20 samples of the bigram distribution. The title lets
# the figure stand on its own; `;` hides the return-value repr.
bigrams_freq.plot(20, title='20 most frequent character bigrams (spaces included)');
We can remove them by using a standard dictionary-comprehension technique.
In [32]:
# Keep only the pairs in which neither character is a space. The result is a
# plain dict mapping each remaining pair to its count.
bigrams_dist_without_space = {
    pair: count
    for pair, count in bigrams_freq.items()
    if ' ' not in pair
}
In [33]:
# Show only a small sample (still a plain dict): the full mapping has one
# entry per distinct bigram and would flood the cell output.
dict(list(bigrams_dist_without_space.items())[:10])
Out[33]:
The problem with this approach is that we end up with a dictionary and not an nltk FreqDist object. Therefore we are deprived of handy methods like plot and most_common.
Another approach is to first remove the bigrams that contain whitespace and then construct a FreqDist from the remaining ones, so that the spaces are never counted.
In [34]:
# Rebuild the bigram iterable (the earlier one was consumed when it was
# counted), then drop every pair in which either character is a space.
bigrams = nltk.bigrams(text_lowercase)
bigrams_without_space = list(filter(lambda pair: ' ' not in pair, bigrams))
In [36]:
# Count the space-free bigrams. Unlike the plain dict built earlier, this is
# a FreqDist, so `plot` is available.
bigrams_frequency = nltk.FreqDist(bigrams_without_space)
# Plot only the first 10 samples. The title lets the figure stand on its own;
# `;` hides the return-value repr.
bigrams_frequency.plot(10, title='10 most frequent character bigrams (no spaces)');
In [ ]: