In [ ]:
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
#open and read the novels, saving each as a single string
austen_string = open('../Data/Austen_PrideAndPrejudice.txt', encoding='utf-8').read()
alcott_string = open('../Data/Alcott_GarlandForGirls.txt', encoding='utf-8').read()
In [ ]:
#tokenize the text
austen_list = word_tokenize(austen_string)
alcott_list = word_tokenize(alcott_string)
print(austen_list[:10])
print(alcott_list[:10])
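If word_tokenize raises a LookupError, the required tokenizer models haven't been downloaded yet; a one-time nltk.download() call fixes this (the stop word list we use later needs the same treatment):
In [ ]:
#one-time download of the punkt tokenizer models and the stop word list
nltk.download('punkt')
nltk.download('stopwords')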
In [ ]:
#pre-processing
#remove punctuation and lowercase. We can do this in one line!
punctuation = list(string.punctuation)
austen_list_clean = [word.lower() for word in austen_list if word not in punctuation]
alcott_list_clean = [word.lower() for word in alcott_list if word not in punctuation]
print(austen_list_clean[:10])
print(alcott_list_clean[:10])
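One caveat: string.punctuation contains only single characters, so multi-character punctuation tokens produced by word_tokenize (such as '--' or paired quotation marks) slip through the filter above. A stricter alternative sketch keeps only purely alphabetic tokens:
In [ ]:
#alternative cleaning: keep only purely alphabetic tokens
austen_list_alpha = [word.lower() for word in austen_list if word.isalpha()]
print(austen_list_alpha[:10])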
We can measure the lexical diversity of each novel with the type-token ratio (TTR): the number of unique words (types) divided by the total number of words (tokens).
In [ ]:
print("TTR for Pride and Prejudice")
print(len(set(austen_list_clean))/len(austen_list_clean))
print("TTR for A Garland for Girls")
print(len(set(alcott_list_clean))/len(alcott_list_clean))
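Note that TTR is sensitive to text length: longer texts inevitably repeat more words, which drives the ratio down. One rough way to control for this (the 10,000-token sample size here is an arbitrary choice) is to compare equal-length samples:
In [ ]:
#compare TTR over equal-length samples of each novel
sample_size = 10000
print(len(set(austen_list_clean[:sample_size]))/sample_size)
print(len(set(alcott_list_clean[:sample_size]))/sample_size)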
NLTK's FreqDist builds a frequency distribution from a list of tokens, counting how often each word appears.
In [ ]:
austen_word_frequency = nltk.FreqDist(austen_list_clean)
alcott_word_frequency = nltk.FreqDist(alcott_list_clean)
print("Frequent words in Pride and Prejudice")
print(austen_word_frequency.most_common(10))
print("Frequent words in A Garland for Girls")
print(alcott_word_frequency.most_common(10))
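A FreqDist also supports dictionary-style lookups, so we can ask how often any individual word occurs (the query words below are just illustrative choices):
In [ ]:
#look up the counts of individual words
print(austen_word_frequency['elizabeth'])
print(alcott_word_frequency['flowers'])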
In [ ]:
#plot the 50 most frequent words, both cumulatively and individually
austen_word_frequency.plot(50, cumulative=True)
austen_word_frequency.plot(50, cumulative=False)
The most frequent words are, of course, stop words: common function words such as 'the', 'of', and 'to' that tell us little about the content of a text. We can filter them out using NLTK's built-in stop word list.
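Here is a peek at that list:
In [ ]:
#peek at NLTK's built-in English stop word list
print(stopwords.words('english')[:15])
print(len(stopwords.words('english')))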
In [ ]:
stop_words = set(stopwords.words('english'))  #build the set once so membership checks are fast
austen_list_clean_sw = [word for word in austen_list_clean if word not in stop_words]
alcott_list_clean_sw = [word for word in alcott_list_clean if word not in stop_words]
austen_word_frequency_sw = nltk.FreqDist(austen_list_clean_sw)
alcott_word_frequency_sw = nltk.FreqDist(alcott_list_clean_sw)
print("Frequent words in Pride and Prejudice")
print(austen_word_frequency_sw.most_common(20))
print()
print("Frequent words in A Garland for Girls")
print(alcott_word_frequency_sw.most_common(20))
The NLTK package has many built-in functions for natural language processing. I encourage you to explore the full range of techniques available. I'll go over two more here: concordance() and similar().
The concordance() function lists every occurrence of the specified word in the text, along with its surrounding context.
In [ ]:
marx_string = open('../Data/Marx_CommunistManifesto.txt', encoding='utf-8').read()
prince_string = open('../Data/Machiavelli_ThePrince.txt', encoding='utf-8').read()
marx_list = word_tokenize(marx_string)
prince_list = word_tokenize(prince_string)
#wrap the token lists in nltk.Text objects, which provide the exploration methods
marx_nltk = nltk.Text(marx_list)
prince_nltk = nltk.Text(prince_list)
print(prince_nltk)
marx_nltk  #as the last expression in the cell, this displays without print()
In [ ]:
marx_nltk.concordance('people')
prince_nltk.concordance('people')
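concordance() also accepts optional width and lines arguments to control the size of the context window and the number of matches printed:
In [ ]:
#show only the first five matches, with a narrower context window
marx_nltk.concordance('people', width=60, lines=5)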
The similar() method takes a word w, finds every context w1 w w2 in which it appears, and then lists the other words w' that occur in those same contexts, i.e. w1 w' w2.
In [ ]:
print("Marx")
marx_nltk.similar('people')
print()
print("Machiavelli")
prince_nltk.similar('people')
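similar() also takes an optional num argument that caps how many words are printed:
In [ ]:
#limit the output to the ten most similar words
prince_nltk.similar('people', num=10)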