Lab 4

Ryan Rose

Scientific Computing

9/21/2016



In [1]:

    
## Imports!

%matplotlib inline
import os
import re
import string
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.mlab import PCA
from scipy.cluster.vq import kmeans, vq

Loading Fifty Books

First, we load all fifty books from their text files.



In [2]:

    
os.chdir("/home/ryan/School/scientific_computing/labs/lab4/books")
filenames = os.listdir()

books = []
for name in filenames:
    with open(name) as f:
        books.append(f.read())

Cleaning up the Data

Next, we create a mapping of titles to their book's text along with removing the Project Gutenberg header and footer.



In [3]:

    
def get_title(text):
    pattern = "\*\*\*\s*START OF (THIS|THE) PROJECT GUTENBERG EBOOK ([A-Z,;' ]*)\*\*\*"
    m = re.search(pattern, text)
    if m:
        return m.group(2).strip()
    return None

def remove_gutenberg_info(text):
    pattern = "\*\*\*\s*START OF (THIS|THE) PROJECT GUTENBERG EBOOK ([A-Z,;' ]*)\*\*\*"
    start = re.search(pattern, text).end()
    pattern = "\*\*\*\s*END OF (THIS|THE) PROJECT GUTENBERG EBOOK ([A-Z,;' ]*)\*\*\*"
    end = re.search(pattern, text).start()
    return text[start:end]



In [4]:

    
cut_off_books = { get_title(book):remove_gutenberg_info(book) for book in books}
pd.DataFrame(cut_off_books, index=["Book's Text"]).T.head()









    Out[4]:






  
    
      
      Book's Text
    
  
  
    
      A DOLL'S HOUSE
      \n\n\n\n\nProduced by Martin Adamson\n\n\n\n\n...
    
    
      A MODEST PROPOSAL
      \n\n\n\n\nProduced by An Anonymous Volunteer\n...
    
    
      A TALE OF TWO CITIES
      \n\n\n\n\nProduced by Judith Boss\n\n\n\n\n\n\...
    
    
      ALICE'S ADVENTURES IN WONDERLAND
      \n\n\n\n\n\n\n\n\n\n\nALICE'S ADVENTURES IN WO...
    
    
      BEOWULF
      \n\n\n\n\nProduced by David Starner, Dainis Mi...

Next, we iterate through all of the words, strip all characters that are not upper or lower-case letters. If the the resulting word is considered, non-empty, we throw it out. Else, we add the word in all lowercase stripped of all non-ASCII letters to our list of words for that book.

This is useful to determine word frequencies.



In [5]:

    
def strip_word(word, alphabet):
    ret = ""
    for c in word:
        if c in alphabet:
            ret += c.lower()
    if len(ret) == 0:
        return None
    else:
        return ret

def get_words(book):
    alphabet = set(string.ascii_letters)
    b = book.split()
    words = []
    for word in b:
        w = strip_word(word, alphabet)
        if w:
            words.append(w)
    return words



In [6]:

    
cut_books = {name:get_words(book) for name, book in cut_off_books.items()}

Determining Frequencies

Now, we determine the frequencies for each word and putting them in a dictionary for each book.



In [7]:

    
def get_word_freq(words):
    word_counts = {}
    for word in words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    return word_counts



In [8]:

    
book_freqs = {}
for name, words in cut_books.items():
    book_freqs[name] = get_word_freq(words)

Top 20 Words

Now, let's determine the top 20 words across the whole corpus



In [9]:

    
total_word_count = {}
for dicts in book_freqs.values():
    for word, count in dicts.items():
        if word in total_word_count:
            total_word_count[word] += count
        else:
            total_word_count[word] = count



In [10]:

    
a, b = zip(*total_word_count.items())
tuples = list(zip(b, a))
tuples.sort()
tuples.reverse()
tuples[:20]









    Out[10]:





[(342034, 'the'),
 (200704, 'and'),
 (171744, 'of'),
 (158988, 'to'),
 (120018, 'a'),
 (97899, 'i'),
 (97785, 'in'),
 (77472, 'he'),
 (73146, 'that'),
 (68075, 'was'),
 (64897, 'it'),
 (60658, 'his'),
 (52443, 'you'),
 (50472, 'with'),
 (46951, 'is'),
 (43998, 'as'),
 (43656, 'had'),
 (43226, 'her'),
 (42166, 'for'),
 (40887, 'not')]



In [11]:

    
_, top_20_words = zip(*tuples[:20])
top_20_words









    Out[11]:





('the',
 'and',
 'of',
 'to',
 'a',
 'i',
 'in',
 'he',
 'that',
 'was',
 'it',
 'his',
 'you',
 'with',
 'is',
 'as',
 'had',
 'her',
 'for',
 'not')

Creating the 20-dimensional vectors

Using the top 20 words above, let's determine the book vectors.



In [12]:

    
def filter_frequencies(frequencies, words):
    d = {}
    for word, freq in frequencies.items():
        if word in words:
            d[word] = freq
    return d

labels = {}
for name, freqs in book_freqs.items():
    labels[name] = filter_frequencies(freqs, top_20_words)



In [13]:

    
df = pd.DataFrame(labels).fillna(0)
df = (df / df.sum()).T
df.head()









    Out[13]:






  
    
      
      a
      and
      as
      for
      had
      he
      her
      his
      i
      in
      is
      it
      not
      of
      that
      the
      to
      was
      with
      you
    
  
  
    
      A DOLL'S HOUSE
      0.060497
      0.073735
      0.022640
      0.026970
      0.010516
      0.024125
      0.019795
      0.013856
      0.115056
      0.040703
      0.055425
      0.067302
      0.022269
      0.050600
      0.054064
      0.098726
      0.084498
      0.023259
      0.017691
      0.118273
    
    
      A MODEST PROPOSAL
      0.089969
      0.118925
      0.040331
      0.047570
      0.000000
      0.006205
      0.002068
      0.010341
      0.056877
      0.072389
      0.021717
      0.026887
      0.016546
      0.143744
      0.040331
      0.175801
      0.110651
      0.007239
      0.012410
      0.000000
    
    
      A TALE OF TWO CITIES
      0.063308
      0.106873
      0.024690
      0.020546
      0.028139
      0.039725
      0.022520
      0.043500
      0.041591
      0.055997
      0.017747
      0.043413
      0.018203
      0.087043
      0.040983
      0.173913
      0.075132
      0.038250
      0.028335
      0.030092
    
    
      ALICE'S ADVENTURES IN WONDERLAND
      0.077388
      0.104172
      0.032338
      0.018884
      0.021970
      0.014811
      0.030610
      0.011849
      0.049371
      0.045297
      0.012589
      0.064922
      0.017773
      0.062824
      0.034066
      0.201308
      0.088990
      0.044063
      0.022340
      0.044433
    
    
      BEOWULF
      0.042015
      0.070633
      0.021921
      0.026705
      0.016006
      0.049408
      0.005654
      0.047930
      0.020355
      0.056367
      0.027314
      0.015484
      0.014614
      0.132046
      0.028967
      0.271138
      0.087161
      0.029923
      0.034534
      0.001827

Creating the Elbow Graph

Let's try each k and see what makes the sharpest elbow.



In [14]:

    
kvals = []
dists = []
for k in range(2, 11):
    centroids, distortion = kmeans(df, k)
    kvals.append(k)
    dists.append(distortion)



In [15]:

    
plt.plot(kvals, dists)
plt.show()

We can see that the best k is 3 or 6.

Clustering

Let's cluster based on k = 3 and plot the clusters.



In [16]:

    
centroids, _ = kmeans(df, 3)
idx, _ = vq(df, centroids)
clusters = {}
for i, cluster in enumerate(idx):
    if cluster in clusters:
        clusters[cluster].append(df.iloc[i].name)
    else:
        clusters[cluster] = [df.iloc[i].name]
clusters









    Out[16]:





{0: ["A DOLL'S HOUSE",
  'CAPTIVITY AND RESTORATION',
  'DRACULA',
  'EMMA',
  'FRANKENSTEIN',
  'GREAT EXPECTATIONS',
  'HUCKLEBERRY FINN',
  'JANE EYRE',
  'MANON LESCAUT',
  'PRIDE AND PREJUDICE',
  'SENSE AND SENSIBILITY',
  'STORY OF MY LIFE',
  'THE ADVENTURES OF SHERLOCK HOLMES',
  'THE GIRL NEXT DOOR',
  'THE IMPORTANCE OF BEING EARNEST',
  'THE MYSTERIOUS AFFAIR AT STYLES',
  'THE PICTURE OF DORIAN GRAY',
  'THE ROMANCE OF LUST',
  'THE YELLOW WALLPAPER',
  'WUTHERING HEIGHTS'],
 1: ['A TALE OF TWO CITIES',
  "ALICE'S ADVENTURES IN WONDERLAND",
  'DUBLINERS',
  'FREDERICK DOUGLASS',
  "GRIMMS' FAIRY TALES",
  "GULLIVER'S TRAVELS",
  'HEART OF DARKNESS',
  'LES MISERABLES',
  'METAMORPHOSIS',
  'PETER PAN',
  'SIDDHARTHA',
  'THE AWAKENING AND SELECTED',
  'THE COUNT OF MONTE CRISTO',
  'THE DIVINE COMEDY, COMPLETE',
  'THE PRINCE',
  'THREE MEN IN A BOAT',
  'TOM SAWYER',
  'TREASURE ISLAND',
  'ULYSSES',
  'WAR AND PEACE'],
 2: ['A MODEST PROPOSAL',
  'BEOWULF',
  'HOW TO ANALYZE PEOPLE ON SIGHT',
  'LEAVES OF GRASS',
  'MOBY DICK; OR THE WHALE',
  'STEAM, ITS GENERATION AND USE',
  'THE ILIAD OF HOMER',
  'THE KAMA SUTRA OF VATSYAYANA',
  'THE LEGEND OF SLEEPY HOLLOW',
  'THE REPUBLIC']}

Do the clusters make sense?

Yes. For instance, we can see that The Republic and The Iliad of Homer are in the same cluster.

Performing PCA

Now, let's perform PCA and determine the most important elements and plot the clusters.



In [17]:

    
m = PCA(df)

fig, ax = plt.subplots()

for i in range(len(idx)):
    plt.plot(m.Y[idx==i, 0], m.Y[idx==i, 1], "o", alpha=.75)

for index, (x, y) in enumerate(zip(m.Y[:, 0], m.Y[:, 1])):
    plt.text(x, y, df.index[index])
    
fig.set_size_inches(36,40)
plt.show()



In [18]:

    
m.sigma.sort_values()[-2:]









    Out[18]:





i      0.037461
the    0.046954
dtype: float64

We can see the data clusters well and the most important words are i and the based on them having the standard deviation. This is based on the concept of PCA.fracs aligning to the variance based on this documentation: https://www.clear.rice.edu/comp130/12spring/pca/pca_docs.shtml. And, since PCA.sigma is the square root of the variance, the highest standard deviation should correspond to the highest value for the PCA.fracs. Then, i and the are the most important words

New Book

So, we continue as before by loading Anne of Green Gables, parsing it, creating an array, and normalizing the book vector.



In [19]:

    
with open("../pg45.txt") as f:
    anne = f.read()
get_title(anne)









    Out[19]:





'ANNE OF GREEN GABLES'



In [20]:

    
anne_cut = remove_gutenberg_info(anne)
anne_words = get_words(anne_cut)
anne_freq = {get_title(anne):filter_frequencies(get_word_freq(anne_words), top_20_words)}
anne_frame = pd.DataFrame(anne_freq).fillna(0)
anne_frame = (anne_frame / anne_frame.sum()).T
anne_frame









    Out[20]:






  
    
      
      a
      and
      as
      for
      had
      he
      her
      his
      i
      in
      is
      it
      not
      of
      that
      the
      to
      was
      with
      you
    
  
  
    
      ANNE OF GREEN GABLES
      0.07306
      0.110804
      0.026105
      0.024209
      0.025606
      0.014067
      0.04353
      0.007116
      0.080642
      0.049017
      0.017891
      0.058495
      0.014299
      0.06355
      0.042034
      0.12976
      0.100895
      0.04526
      0.026171
      0.047488

Now, let's do k-means based on the previously determined k.



In [21]:

    
df_with_anne = df.append(anne_frame).sort_index()

centroids, _ = kmeans(df_with_anne, 3)
idx2, _ = vq(df_with_anne, centroids)
clusters = {}
for i, cluster in enumerate(idx2):
    if cluster in clusters:
        clusters[cluster].append(df_with_anne.iloc[i].name)
    else:
        clusters[cluster] = [df_with_anne.iloc[i].name]
clusters









    Out[21]:





{0: ["A DOLL'S HOUSE",
  'ANNE OF GREEN GABLES',
  'CAPTIVITY AND RESTORATION',
  'DRACULA',
  'EMMA',
  'FRANKENSTEIN',
  'GREAT EXPECTATIONS',
  'HUCKLEBERRY FINN',
  'JANE EYRE',
  'MANON LESCAUT',
  'PRIDE AND PREJUDICE',
  'SENSE AND SENSIBILITY',
  'STORY OF MY LIFE',
  'THE ADVENTURES OF SHERLOCK HOLMES',
  'THE GIRL NEXT DOOR',
  'THE IMPORTANCE OF BEING EARNEST',
  'THE MYSTERIOUS AFFAIR AT STYLES',
  'THE PICTURE OF DORIAN GRAY',
  'THE ROMANCE OF LUST',
  'THE YELLOW WALLPAPER',
  'WUTHERING HEIGHTS'],
 1: ['A TALE OF TWO CITIES',
  "ALICE'S ADVENTURES IN WONDERLAND",
  'DUBLINERS',
  'FREDERICK DOUGLASS',
  "GRIMMS' FAIRY TALES",
  "GULLIVER'S TRAVELS",
  'HEART OF DARKNESS',
  'LES MISERABLES',
  'METAMORPHOSIS',
  'PETER PAN',
  'SIDDHARTHA',
  'THE AWAKENING AND SELECTED',
  'THE COUNT OF MONTE CRISTO',
  'THE DIVINE COMEDY, COMPLETE',
  'THE PRINCE',
  'THREE MEN IN A BOAT',
  'TOM SAWYER',
  'TREASURE ISLAND',
  'ULYSSES',
  'WAR AND PEACE'],
 2: ['A MODEST PROPOSAL',
  'BEOWULF',
  'HOW TO ANALYZE PEOPLE ON SIGHT',
  'LEAVES OF GRASS',
  'MOBY DICK; OR THE WHALE',
  'STEAM, ITS GENERATION AND USE',
  'THE ILIAD OF HOMER',
  'THE KAMA SUTRA OF VATSYAYANA',
  'THE LEGEND OF SLEEPY HOLLOW',
  'THE REPUBLIC']}



In [34]:

    
coords = m.project(np.array(anne_frame).flatten())

fig, _ = plt.subplots()

plt.plot(coords[0], coords[1], "s", markeredgewidth=5)

for i in range(len(idx)):
    plt.plot(m.Y[idx==i, 0], m.Y[idx==i, 1], "o", alpha=.75)

for index, (x, y) in enumerate(zip(m.Y[:, 0], m.Y[:, 1])):
    plt.text(x, y, df.index[index])
    
fig.set_size_inches(36,40)

plt.show()

We can see that the new book is the black square above. In addition, it makes sense it fits into that cluster especially when we compare it to Jane Eyre.

Stop Words



In [23]:

    
stop_words_text = open("../common-english-words.txt").read()
stop_words = stop_words_text.split(",")
stop_words[:5]









    Out[23]:





['a', 'able', 'about', 'across', 'after']



In [24]:

    
word_counts_without_stop = [t for t in tuples if t[1] not in stop_words]
word_counts_without_stop[:20]









    Out[24]:





[(21027, 'one'),
 (13372, 'out'),
 (12998, 'up'),
 (12103, 'more'),
 (11860, 'now'),
 (11155, 'very'),
 (10975, 'man'),
 (9634, 'time'),
 (8722, 'little'),
 (8714, 'see'),
 (8506, 'well'),
 (8069, 'know'),
 (7950, 'such'),
 (7757, 'two'),
 (7672, 'before'),
 (7439, 'mr'),
 (7327, 'down'),
 (7141, 'over'),
 (7060, 'good'),
 (6807, 'upon')]



In [25]:

    
_, top_20_without_stop = zip(*word_counts_without_stop[:20])
top_20_without_stop









    Out[25]:





('one',
 'out',
 'up',
 'more',
 'now',
 'very',
 'man',
 'time',
 'little',
 'see',
 'well',
 'know',
 'such',
 'two',
 'before',
 'mr',
 'down',
 'over',
 'good',
 'upon')



In [26]:

    
no_stop_labels = {}
for name, freqs in book_freqs.items():
    no_stop_labels[name] = filter_frequencies(freqs, top_20_without_stop)



In [27]:

    
df_without_stop = pd.DataFrame(no_stop_labels).fillna(0)
df_without_stop = (df_without_stop / df_without_stop.sum()).T
df_without_stop.head()









    Out[27]:






  
    
      
      before
      down
      good
      know
      little
      man
      more
      mr
      now
      one
      out
      over
      see
      such
      time
      two
      up
      upon
      very
      well
    
  
  
    
      A DOLL'S HOUSE
      0.015654
      0.033149
      0.036832
      0.090239
      0.087477
      0.038674
      0.030387
      0.009208
      0.096685
      0.081031
      0.076427
      0.032228
      0.056169
      0.035912
      0.034070
      0.022099
      0.074586
      0.011971
      0.063536
      0.073665
    
    
      A MODEST PROPOSAL
      0.029412
      0.000000
      0.078431
      0.000000
      0.068627
      0.019608
      0.058824
      0.000000
      0.058824
      0.147059
      0.019608
      0.000000
      0.019608
      0.029412
      0.049020
      0.078431
      0.039216
      0.127451
      0.098039
      0.078431
    
    
      A TALE OF TWO CITIES
      0.042186
      0.042553
      0.038335
      0.042003
      0.048606
      0.051174
      0.047689
      0.113720
      0.046955
      0.080154
      0.081621
      0.031548
      0.035767
      0.031915
      0.047689
      0.038701
      0.056126
      0.053008
      0.039618
      0.030631
    
    
      ALICE'S ADVENTURES IN WONDERLAND
      0.028884
      0.078845
      0.018735
      0.066354
      0.099922
      0.003903
      0.037471
      0.000000
      0.044496
      0.078845
      0.088212
      0.031226
      0.051522
      0.032006
      0.053084
      0.030445
      0.076503
      0.020297
      0.112412
      0.046838
    
    
      BEOWULF
      0.028274
      0.029762
      0.046131
      0.020833
      0.043155
      0.075893
      0.055060
      0.000000
      0.061012
      0.196429
      0.050595
      0.043155
      0.058036
      0.040179
      0.038690
      0.038690
      0.035714
      0.035714
      0.059524
      0.043155



In [28]:

    
kvals = []
dists = []
for k in range(2, 11):
    centroids, distortion = kmeans(df_without_stop, k)
    kvals.append(k)
    dists.append(distortion)



In [29]:

    
plt.plot(kvals, dists)
plt.show()

We can see that our k could be 3 or 7. Let's choose 7.



In [30]:

    
centroids, _ = kmeans(df_without_stop, 7)
idx3, _ = vq(df, centroids)
clusters = {}
for i, cluster in enumerate(idx3):
    if cluster in clusters:
        clusters[cluster].append(df_without_stop.iloc[i].name)
    else:
        clusters[cluster] = [df_without_stop.iloc[i].name]
clusters









    Out[30]:





{1: ['METAMORPHOSIS', 'SIDDHARTHA', 'THE ILIAD OF HOMER'],
 5: ["A DOLL'S HOUSE",
  'CAPTIVITY AND RESTORATION',
  'DRACULA',
  'EMMA',
  "GRIMMS' FAIRY TALES",
  'HOW TO ANALYZE PEOPLE ON SIGHT',
  'HUCKLEBERRY FINN',
  'JANE EYRE',
  'LEAVES OF GRASS',
  'STORY OF MY LIFE',
  'THE IMPORTANCE OF BEING EARNEST',
  'THE PRINCE',
  'THE REPUBLIC',
  'THE YELLOW WALLPAPER',
  'THREE MEN IN A BOAT',
  'WUTHERING HEIGHTS'],
 6: ['A MODEST PROPOSAL',
  'A TALE OF TWO CITIES',
  "ALICE'S ADVENTURES IN WONDERLAND",
  'BEOWULF',
  'DUBLINERS',
  'FRANKENSTEIN',
  'FREDERICK DOUGLASS',
  'GREAT EXPECTATIONS',
  "GULLIVER'S TRAVELS",
  'HEART OF DARKNESS',
  'LES MISERABLES',
  'MANON LESCAUT',
  'MOBY DICK; OR THE WHALE',
  'PETER PAN',
  'PRIDE AND PREJUDICE',
  'SENSE AND SENSIBILITY',
  'STEAM, ITS GENERATION AND USE',
  'THE ADVENTURES OF SHERLOCK HOLMES',
  'THE AWAKENING AND SELECTED',
  'THE COUNT OF MONTE CRISTO',
  'THE DIVINE COMEDY, COMPLETE',
  'THE GIRL NEXT DOOR',
  'THE KAMA SUTRA OF VATSYAYANA',
  'THE LEGEND OF SLEEPY HOLLOW',
  'THE MYSTERIOUS AFFAIR AT STYLES',
  'THE PICTURE OF DORIAN GRAY',
  'THE ROMANCE OF LUST',
  'TOM SAWYER',
  'TREASURE ISLAND',
  'ULYSSES',
  'WAR AND PEACE']}



In [33]:

    
m2 = PCA(df_without_stop)

fig, _ = plt.subplots()

for i in range(len(idx3)):
    plt.plot(m2.Y[idx3==i, 0], m2.Y[idx3==i, 1], "o", alpha=.75)
    
for index, (x, y) in enumerate(zip(m2.Y[:, 0], m2.Y[:, 1])):
    plt.text(x, y, df_without_stop.index[index])
    
fig.set_size_inches(36,40)
plt.show()



In [32]:

    
m2.sigma.sort_values()[-2:]









    Out[32]:





man    0.039617
mr     0.049177
dtype: float64

We can see that man and mr are the most important words in this set. This seems to signify male-dominated stories and characters. This makes sense given that historically stories typically focus on men.

Conclusion

We can see that books' words cluster based on year and genre. In addition, we found an interesting finding without stop words that this specific data set is male-dominated.

	Book's Text
A DOLL'S HOUSE	\n\n\n\n\nProduced by Martin Adamson\n\n\n\n\n...
A MODEST PROPOSAL	\n\n\n\n\nProduced by An Anonymous Volunteer\n...
A TALE OF TWO CITIES	\n\n\n\n\nProduced by Judith Boss\n\n\n\n\n\n\...
ALICE'S ADVENTURES IN WONDERLAND	\n\n\n\n\n\n\n\n\n\n\nALICE'S ADVENTURES IN WO...
BEOWULF	\n\n\n\n\nProduced by David Starner, Dainis Mi...

	a	and	as	for	had	he	her	his	i	in	is	it	not	of	that	the	to	was	with	you
A DOLL'S HOUSE	0.060497	0.073735	0.022640	0.026970	0.010516	0.024125	0.019795	0.013856	0.115056	0.040703	0.055425	0.067302	0.022269	0.050600	0.054064	0.098726	0.084498	0.023259	0.017691	0.118273
A MODEST PROPOSAL	0.089969	0.118925	0.040331	0.047570	0.000000	0.006205	0.002068	0.010341	0.056877	0.072389	0.021717	0.026887	0.016546	0.143744	0.040331	0.175801	0.110651	0.007239	0.012410	0.000000
A TALE OF TWO CITIES	0.063308	0.106873	0.024690	0.020546	0.028139	0.039725	0.022520	0.043500	0.041591	0.055997	0.017747	0.043413	0.018203	0.087043	0.040983	0.173913	0.075132	0.038250	0.028335	0.030092
ALICE'S ADVENTURES IN WONDERLAND	0.077388	0.104172	0.032338	0.018884	0.021970	0.014811	0.030610	0.011849	0.049371	0.045297	0.012589	0.064922	0.017773	0.062824	0.034066	0.201308	0.088990	0.044063	0.022340	0.044433
BEOWULF	0.042015	0.070633	0.021921	0.026705	0.016006	0.049408	0.005654	0.047930	0.020355	0.056367	0.027314	0.015484	0.014614	0.132046	0.028967	0.271138	0.087161	0.029923	0.034534	0.001827

	before	down	good	know	little	man	more	mr	now	one	out	over	see	such	time	two	up	upon	very	well
A DOLL'S HOUSE	0.015654	0.033149	0.036832	0.090239	0.087477	0.038674	0.030387	0.009208	0.096685	0.081031	0.076427	0.032228	0.056169	0.035912	0.034070	0.022099	0.074586	0.011971	0.063536	0.073665
A MODEST PROPOSAL	0.029412	0.000000	0.078431	0.000000	0.068627	0.019608	0.058824	0.000000	0.058824	0.147059	0.019608	0.000000	0.019608	0.029412	0.049020	0.078431	0.039216	0.127451	0.098039	0.078431
A TALE OF TWO CITIES	0.042186	0.042553	0.038335	0.042003	0.048606	0.051174	0.047689	0.113720	0.046955	0.080154	0.081621	0.031548	0.035767	0.031915	0.047689	0.038701	0.056126	0.053008	0.039618	0.030631
ALICE'S ADVENTURES IN WONDERLAND	0.028884	0.078845	0.018735	0.066354	0.099922	0.003903	0.037471	0.000000	0.044496	0.078845	0.088212	0.031226	0.051522	0.032006	0.053084	0.030445	0.076503	0.020297	0.112412	0.046838
BEOWULF	0.028274	0.029762	0.046131	0.020833	0.043155	0.075893	0.055060	0.000000	0.061012	0.196429	0.050595	0.043155	0.058036	0.040179	0.038690	0.038690	0.035714	0.035714	0.059524	0.043155