In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import nltk
In [2]:
from nltk.book import *
In [3]:
text1
Out[3]:
In [4]:
text2
Out[4]:
concordance
is a view that shows every occurrence of a word alongside some context
In [5]:
text1.concordance("monstrous")
In [6]:
text2.concordance("affection")
In [7]:
text3.concordance("lived")
similar
shows other words that appear in similar contexts to the given word
In [8]:
text1.similar("monstrous")
In [9]:
text2.similar("monstrous")
text 1 (Melville) uses monstrous very differently from text 2 (Austen)
common_contexts
shows contexts that are shared by two or more words
In [10]:
text2.common_contexts(["monstrous", "very"])
trying out other words...
In [11]:
text2.similar("affection")
In [12]:
text2.common_contexts(["affection", "regard"])
Lexical Dispersion Plot
Determining the location of words in a text (how far from the beginning, counted in words, does a given word appear?) -- using dispersion_plot
In [44]:
plt.figure(figsize=(18,10))
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America", "liberty", "constitution"])
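To make "word offset" concrete: each position in the text is an index, so the offsets of a word can be listed with plain Python. A small illustrative sketch (the name offsets is only for illustration).
In [ ]:
# word offsets (positions from the beginning) of "liberty" in text4
offsets = [i for i, w in enumerate(text4) if w == 'liberty']
offsets[:5]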
Generating some random text in the style of text3 -- using generate()
generate() is not available in NLTK 3.0
In [14]:
# (not available in NLTK 3.0)
# text3.generate()
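As a rough stand-in for generate(), a bigram model built with ConditionalFreqDist can produce similar random-looking text. This is only a sketch: generate_model is not an NLTK function, and always picking the most likely next word soon falls into repetitive loops.
In [ ]:
# conditional frequency distribution of word -> following word in text3
cfd3 = nltk.ConditionalFreqDist(nltk.bigrams(text3))

def generate_model(cfd, word, num=15):
    """Greedy bigram generation: repeatedly emit the most likely follower."""
    words = []
    for _ in range(num):
        words.append(word)
        word = cfd[word].max()
    return ' '.join(words)

generate_model(cfd3, 'living')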
Count the number of tokens using len
In [15]:
len(text3)
Out[15]:
View/count vocabulary using set(text_obj)
In [16]:
len(set(text3))
Out[16]:
In [17]:
# first 50 items of the sorted vocabulary
sorted(set(text3))[:50]
Out[17]:
Calculating lexical richness of the text
In [18]:
len(set(text3)) / len(text3)
Out[18]:
Count how often a word occurs in the text
In [19]:
text3.count("smote")
Out[19]:
Compute what percentage of the text is taken up by a specific word
In [20]:
100 * text4.count('a') / len(text4)
Out[20]:
In [21]:
text5.count('lol')
Out[21]:
In [22]:
100 * text5.count('lol') / len(text5)
Out[22]:
Define some simple functions to calculate these values
In [23]:
def lexical_diversity(text):
return len(set(text)) / len(text)
def percentage(count, total):
return 100 * count / total
In [24]:
lexical_diversity(text3), lexical_diversity(text5)
Out[24]:
In [25]:
percentage(text4.count('a'), len(text4))
Out[25]:
In [26]:
sent1
Out[26]:
In [27]:
sent2
Out[27]:
In [28]:
lexical_diversity(sent1)
Out[28]:
List Concatenation
In [29]:
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']
Out[29]:
Indexing Lists (...and Text objects)
In [30]:
text4[173]
Out[30]:
In [31]:
text4.index('awaken')
Out[31]:
In [32]:
text5[16715:16735]
Out[32]:
In [33]:
text6[1600:1625]
Out[33]:
In [34]:
saying = 'After all is said and done more is said than done'.split()
tokens = sorted(set(saying))
tokens[-2:]
Out[34]:
Frequency Distributions
In [35]:
fdist1 = FreqDist(text1)
print(fdist1)
In [36]:
fdist1.most_common(50)
Out[36]:
In [37]:
fdist1['whale']
Out[37]:
The 50 most frequent words account for almost half of the book
In [38]:
plt.figure(figsize=(18,10))
fdist1.plot(50, cumulative=True)
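A quick check of that claim, using the counts already in fdist1 (a small sketch; it just sums the most_common counts).
In [ ]:
# fraction of all tokens in text1 covered by its 50 most frequent words
sum(count for _, count in fdist1.most_common(50)) / len(text1)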
Fine-grained Selection of Words
Looking at the long words of a text (perhaps these are more meaningful words?)
In [39]:
V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)
Out[39]:
words that are longer than 7 characters and occur more than 7 times
In [40]:
fdist5 = FreqDist(text5)
sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7)
Out[40]:
collocation - a sequence of words that occur together unusually often (red wine is a collocation, whereas the wine is not)
In [41]:
list(nltk.bigrams(['more', 'is', 'said', 'than', 'done'])) # bigrams() returns a generator
Out[41]:
collocations are essentially just frequent bigrams -- but we want to pay special attention to the cases that involve rare words
collocations()
returns bigrams that occur more often than expected, based on word frequency
In [42]:
text4.collocations()
In [43]:
text8.collocations()
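A similar ranking can be reproduced by hand with nltk.collocations; a sketch assuming a likelihood-ratio score and a minimum frequency of 2 (both arbitrary choices here, not necessarily what collocations() uses internally).
In [ ]:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# rank bigrams in text4 by a likelihood-ratio association score
measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text4)
finder.apply_freq_filter(2)   # ignore bigrams seen fewer than 2 times
finder.nbest(measures.likelihood_ratio, 20)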
Counting Other Things
word length distribution in text1
In [47]:
[len(w) for w in text1][:10]
Out[47]:
In [48]:
fdist = FreqDist(len(w) for w in text1)
print(fdist)
In [49]:
fdist
Out[49]:
In [50]:
fdist.most_common()
Out[50]:
In [52]:
fdist.max()
Out[52]:
In [53]:
fdist[3]
Out[53]:
In [54]:
fdist.freq(3)
Out[54]:
words of length 3 (~50k) make up ~20% of all words in the book
In [56]:
len(text1)
Out[56]:
In [57]:
len(set(text1))
Out[57]:
In [58]:
len(set(word.lower() for word in text1))
Out[58]:
Only include alphabetic words -- no punctuation
In [59]:
len(set(word.lower() for word in text1 if word.isalpha()))
Out[59]:
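As a follow-up, the lexical_diversity figure from earlier can be recomputed on this normalized token stream (a small illustrative sketch using nothing beyond lower() and isalpha()).
In [ ]:
# lexical diversity after case-folding and dropping punctuation tokens
norm = [w.lower() for w in text1 if w.isalpha()]
len(set(norm)) / len(norm)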
In [ ]: