In [45]:
% matplotlib inline
import codecs
import re
import copy
import collections
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from __future__ import division
We need some specialized functions from NLTK that are not included by default. It is possible to download just the "stopwords" portion but it may be easier to simply download everything in NLTK. Note that this is very time consuming; it took over 30 minutes on my machine.
In [ ]:
nltk.download('all')
Get the "stopwords" package from NLTK.
In [46]:
from nltk.corpus import stopwords
In [47]:
def _read_utf8(path):
    """Read an entire UTF-8 text file and return it as a single string."""
    with codecs.open(path, "r", encoding="utf-8") as fh:
        return fh.read()

# Load both novels into memory as plain strings.
jane_eyre = _read_utf8("JaneEyre.txt")
wuthering_heights = _read_utf8("WutheringHeights.txt")
In [48]:
# English stop words, held in a set: membership is tested once per token
# during tokenization, and set lookup is O(1) versus O(n) for a list.
esw = set(stopwords.words('english'))
esw.add("would")  # frequent in 19th-century prose but uninformative
Define a regular-expression pattern used to filter tokens, keeping only those made entirely of word characters.
In [49]:
word_pattern = re.compile("^\w+$")
Create a token counter function.
In [50]:
def get_text_counter(text):
    """Tokenize ``text`` and count its normalized word tokens.

    Pipeline: split into tokens, lowercase, keep only purely
    word-character tokens that are not stop words, then Porter-stem each
    surviving token.

    BUG FIX: the original called ``PorterStemmer().stem(text)`` on the
    whole document; the Porter stemmer operates on a single word, so that
    only mangled the final word of the text.  Stemming is applied per
    token here.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    (collections.Counter, int)
        Counter of stemmed tokens, and the number of tokens kept.
    """
    stemmer = PorterStemmer()  # build once, reuse for every token
    tokens = WordPunctTokenizer().tokenize(text)
    tokens = [token.lower() for token in tokens]
    kept = [token for token in tokens
            if word_pattern.match(token) and token not in esw]
    stemmed = [stemmer.stem(token) for token in kept]
    return collections.Counter(stemmed), len(stemmed)
Create a function to calculate the absolute frequency and relative frequency of the most common words.
In [51]:
def make_df(counter, size):
    """Build a word-frequency table from (word, count) pairs.

    Parameters
    ----------
    counter : list of (str, int)
        Word/count pairs, e.g. the result of ``Counter.most_common(n)``.
    size : int
        Total token count, used as the denominator for relative frequency.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with an integer "Absolute frequency" column and
        a float "Relative frequency" column.  (The original stacked both
        columns into one float ndarray, which displayed counts as floats.)
    """
    words = [word for word, _ in counter]
    abs_freq = np.array([count for _, count in counter])
    rel_freq = abs_freq / size  # true division -> float
    df = pd.DataFrame(
        {"Absolute frequency": abs_freq, "Relative frequency": rel_freq},
        index=words,
        columns=["Absolute frequency", "Relative frequency"],
    )
    df.index.name = "Most common words"
    return df
Calculate the most common words of Jane Eyre. This takes a while. Then display the 10 most common.
In [52]:
# Tokenize and count Jane Eyre (slow on the full novel), then display
# the 10 most common words with absolute and relative frequencies.
je_counter, je_size = get_text_counter(jane_eyre)
make_df(je_counter.most_common(10), je_size)
Out[52]:
Save the 1000 most common words of Jane Eyre to CSV.
In [53]:
# Persist the 1000 most common Jane Eyre words (with frequencies) to CSV.
je_df = make_df(je_counter.most_common(1000), je_size)
je_df.to_csv("JE_1000.csv")
Calculate the most common words of Wuthering Heights. This also takes a while. Display the 10 most common.
In [54]:
# Tokenize and count Wuthering Heights, then display its 10 most
# common words with absolute and relative frequencies.
wh_counter, wh_size = get_text_counter(wuthering_heights)
make_df(wh_counter.most_common(10), wh_size)
Out[54]:
Save the 1000 most common words of Wuthering Heights to CSV.
In [55]:
# Persist the 1000 most common Wuthering Heights words to CSV.
wh_df = make_df(wh_counter.most_common(1000), wh_size)
wh_df.to_csv("WH_1000.csv")
Find the most common words across the two documents.
In [56]:
# Combine the two novels' counts and take the overall 1000 most common
# words.  BUG FIX: the original passed ``wh_counter`` to make_df here,
# so the "combined" vocabulary was really Wuthering Heights' alone.
all_counter = wh_counter + je_counter
# A dummy size of 1 is fine: only the word index is consumed below.
all_df = make_df(all_counter.most_common(1000), 1)
most_common_words = all_df.index.values
Create a data frame with the word frequency differences.
In [57]:
# For each word in the combined top-1000 vocabulary, compute its relative
# frequency in each novel and the absolute difference between the two.
df_data = []
for word in most_common_words:
    je_rel = je_counter.get(word, 0) / je_size
    wh_rel = wh_counter.get(word, 0) / wh_size
    df_data.append([je_rel, wh_rel, abs(je_rel - wh_rel)])
dist_df = pd.DataFrame(
    data=df_data,
    index=most_common_words,
    columns=["Jane Eyre relative frequency",
             "Wuthering Heights relative frequency",
             "Relative frequency difference"],
)
dist_df.index.name = "Most common words"
# Reassign rather than sorting with inplace=True: no speed benefit, and
# in-place mutation makes the cell non-idempotent on re-run.
dist_df = dist_df.sort_values("Relative frequency difference", ascending=False)
Display the most distinctive words.
In [58]:
dist_df.head(10)
Out[58]:
Save the full list of distinctive words to a CSV file named "bronte.csv".
In [59]:
dist_df.to_csv("bronte.csv")
In [ ]: