In [1]:
## Imports!
%matplotlib inline
import os
import re
import string
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.mlab import PCA
from scipy.cluster.vq import kmeans, vq
In [2]:
os.chdir("/home/ryan/School/scientific_computing/labs/lab4/books")
filenames = os.listdir()
books = []
for name in filenames:
    with open(name) as f:
        books.append(f.read())
In [3]:
def get_title(text):
    # Pull the title out of the Project Gutenberg start marker.
    pattern = r"\*\*\*\s*START OF (THIS|THE) PROJECT GUTENBERG EBOOK ([A-Z,;' ]*)\*\*\*"
    m = re.search(pattern, text)
    if m:
        return m.group(2).strip()
    return None

def remove_gutenberg_info(text):
    # Keep only the text between the Gutenberg start and end markers.
    pattern = r"\*\*\*\s*START OF (THIS|THE) PROJECT GUTENBERG EBOOK ([A-Z,;' ]*)\*\*\*"
    start = re.search(pattern, text).end()
    pattern = r"\*\*\*\s*END OF (THIS|THE) PROJECT GUTENBERG EBOOK ([A-Z,;' ]*)\*\*\*"
    end = re.search(pattern, text).start()
    return text[start:end]
In [4]:
cut_off_books = { get_title(book):remove_gutenberg_info(book) for book in books}
pd.DataFrame(cut_off_books, index=["Book's Text"]).T.head()
Out[4]:
Next, we iterate through all of the words and strip out every character that is not an upper- or lower-case ASCII letter. If the resulting word is empty, we throw it out; otherwise, we add the lowercased, stripped word to that book's list of words.
This is useful for determining word frequencies.
In [5]:
def strip_word(word, alphabet):
    # Keep only ASCII letters, lowercased; return None if nothing survives.
    ret = ""
    for c in word:
        if c in alphabet:
            ret += c.lower()
    if len(ret) == 0:
        return None
    else:
        return ret

def get_words(book):
    # Split on whitespace and strip each token down to its letters.
    alphabet = set(string.ascii_letters)
    b = book.split()
    words = []
    for word in b:
        w = strip_word(word, alphabet)
        if w:
            words.append(w)
    return words
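As a quick sanity check of the stripping behavior (the sentence below is a made-up example, not text from the corpus): punctuation and digits are dropped, words are lowercased, and tokens with no ASCII letters left disappear entirely.
get_words("Mr. Darcy's house -- built in 1862 -- was *very* large!")
# expected output: ['mr', 'darcys', 'house', 'built', 'in', 'was', 'very', 'large']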
In [6]:
cut_books = {name:get_words(book) for name, book in cut_off_books.items()}
In [7]:
def get_word_freq(words):
    # Count how many times each word appears.
    word_counts = {}
    for word in words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    return word_counts
In [8]:
book_freqs = {}
for name, words in cut_books.items():
    book_freqs[name] = get_word_freq(words)
In [9]:
total_word_count = {}
for dicts in book_freqs.values():
    for word, count in dicts.items():
        if word in total_word_count:
            total_word_count[word] += count
        else:
            total_word_count[word] = count
In [10]:
a, b = zip(*total_word_count.items())
tuples = list(zip(b, a))
tuples.sort()
tuples.reverse()
tuples[:20]
Out[10]:
In [11]:
_, top_20_words = zip(*tuples[:20])
top_20_words
Out[11]:
In [12]:
def filter_frequencies(frequencies, words):
    # Keep only the frequency counts for the words we care about.
    d = {}
    for word, freq in frequencies.items():
        if word in words:
            d[word] = freq
    return d

labels = {}
for name, freqs in book_freqs.items():
    labels[name] = filter_frequencies(freqs, top_20_words)
In [13]:
df = pd.DataFrame(labels).fillna(0)
df = (df / df.sum()).T
df.head()
Out[13]:
In [14]:
kvals = []
dists = []
for k in range(2, 11):
    centroids, distortion = kmeans(df, k)
    kvals.append(k)
    dists.append(distortion)
In [15]:
plt.plot(kvals, dists)
plt.show()
In [16]:
centroids, _ = kmeans(df, 3)
idx, _ = vq(df, centroids)
clusters = {}
for i, cluster in enumerate(idx):
    if cluster in clusters:
        clusters[cluster].append(df.iloc[i].name)
    else:
        clusters[cluster] = [df.iloc[i].name]
clusters
Out[16]:
In [17]:
m = PCA(df)
fig, ax = plt.subplots()
for i in range(len(idx)):
    plt.plot(m.Y[idx==i, 0], m.Y[idx==i, 1], "o", alpha=.75)
for index, (x, y) in enumerate(zip(m.Y[:, 0], m.Y[:, 1])):
    plt.text(x, y, df.index[index])
fig.set_size_inches(36,40)
plt.show()
In [18]:
m.sigma.sort_values()[-2:]
Out[18]:
We can see that the data clusters well and that the most important words are "i" and "the", based on them having the highest standard deviations. This follows from PCA.fracs corresponding to the proportion of variance, per this documentation: https://www.clear.rice.edu/comp130/12spring/pca/pca_docs.shtml. Since PCA.sigma is the square root of the variance, the highest standard deviations should correspond to the highest values of PCA.fracs, so "i" and "the" are the most important words.
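As a rough check of that reasoning (a minimal sketch, assuming the mlab PCA attributes behave as the linked documentation describes), we can square PCA.sigma to recover the per-word variance and confirm that i and the sit at the top:
# Hedged sanity check: sigma**2 is the per-word variance of the frequency matrix,
# so its largest entries should match the "most important" words named above.
word_variance = m.sigma ** 2
word_variance.sort_values(ascending=False).head()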
In [19]:
with open("../pg45.txt") as f:
anne = f.read()
get_title(anne)
Out[19]:
In [20]:
anne_cut = remove_gutenberg_info(anne)
anne_words = get_words(anne_cut)
anne_freq = {get_title(anne):filter_frequencies(get_word_freq(anne_words), top_20_words)}
anne_frame = pd.DataFrame(anne_freq).fillna(0)
anne_frame = (anne_frame / anne_frame.sum()).T
anne_frame
Out[20]:
Now, let's do k-means based on the previously determined k.
In [21]:
df_with_anne = df.append(anne_frame).sort_index()
centroids, _ = kmeans(df_with_anne, 3)
idx2, _ = vq(df_with_anne, centroids)
clusters = {}
for i, cluster in enumerate(idx2):
    if cluster in clusters:
        clusters[cluster].append(df_with_anne.iloc[i].name)
    else:
        clusters[cluster] = [df_with_anne.iloc[i].name]
clusters
Out[21]:
In [34]:
coords = m.project(np.array(anne_frame).flatten())
fig, _ = plt.subplots()
plt.plot(coords[0], coords[1], "s", markeredgewidth=5)
for i in range(len(idx)):
    plt.plot(m.Y[idx==i, 0], m.Y[idx==i, 1], "o", alpha=.75)
for index, (x, y) in enumerate(zip(m.Y[:, 0], m.Y[:, 1])):
    plt.text(x, y, df.index[index])
fig.set_size_inches(36,40)
plt.show()
We can see that the new book is the black square above. It also makes sense that it falls into that cluster, especially when we compare it to Jane Eyre.
In [23]:
stop_words_text = open("../common-english-words.txt").read()
stop_words = stop_words_text.split(",")
stop_words[:5]
Out[23]:
In [24]:
word_counts_without_stop = [t for t in tuples if t[1] not in stop_words]
word_counts_without_stop[:20]
Out[24]:
In [25]:
_, top_20_without_stop = zip(*word_counts_without_stop[:20])
top_20_without_stop
Out[25]:
In [26]:
no_stop_labels = {}
for name, freqs in book_freqs.items():
    no_stop_labels[name] = filter_frequencies(freqs, top_20_without_stop)
In [27]:
df_without_stop = pd.DataFrame(no_stop_labels).fillna(0)
df_without_stop = (df_without_stop / df_without_stop.sum()).T
df_without_stop.head()
Out[27]:
In [28]:
kvals = []
dists = []
for k in range(2, 11):
    centroids, distortion = kmeans(df_without_stop, k)
    kvals.append(k)
    dists.append(distortion)
In [29]:
plt.plot(kvals, dists)
plt.show()
We can see that our k could be 3 or 7. Let's choose 7.
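To make that elbow easier to read off numerically, here is a small sketch (reusing the kvals and dists computed above; note that kmeans is stochastic, so distortion is not strictly guaranteed to decrease on a single run) of the relative drop in distortion between successive values of k:
# Relative improvement in distortion from k-1 to k; small values mean an extra
# cluster no longer buys much, which is what the elbow shows visually.
relative_drops = pd.Series(
    [(dists[i - 1] - dists[i]) / dists[i - 1] for i in range(1, len(dists))],
    index=kvals[1:],
)
relative_drops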
In [30]:
centroids, _ = kmeans(df_without_stop, 7)
idx3, _ = vq(df_without_stop, centroids)  # quantize the same matrix the centroids were fit on
clusters = {}
for i, cluster in enumerate(idx3):
    if cluster in clusters:
        clusters[cluster].append(df_without_stop.iloc[i].name)
    else:
        clusters[cluster] = [df_without_stop.iloc[i].name]
clusters
Out[30]:
In [33]:
m2 = PCA(df_without_stop)
fig, _ = plt.subplots()
for i in range(len(idx3)):
    plt.plot(m2.Y[idx3==i, 0], m2.Y[idx3==i, 1], "o", alpha=.75)
for index, (x, y) in enumerate(zip(m2.Y[:, 0], m2.Y[:, 1])):
    plt.text(x, y, df_without_stop.index[index])
fig.set_size_inches(36,40)
plt.show()
In [32]:
m2.sigma.sort_values()[-2:]
Out[32]:
We can see that "man" and "mr" are the most important words in this set. This seems to signify male-dominated stories and characters, which makes sense given that, historically, stories have typically focused on men.