Experiment 2 Annotator

This annotates the text of *Middlemarch* with the number of times each passage has been quoted, using the rewritten text matcher.


In [11]:
import pandas as pd
import nltk
%matplotlib inline
import math
from ast import literal_eval
import numpy as np
import re
from matplotlib import pyplot as plt
from colour import Color
from IPython.core.display import HTML
from matplotlib import cm
from matplotlib.colors import rgb2hex
plt.rcParams["figure.figsize"] = [16, 6]

In [12]:
df = pd.read_json('../txt/e2a.json')

In [14]:
# Bin each article's publication year into its decade (e.g. 1973 -> 1970).
# (Removed a stale commented-out line left over from an older workflow.)
df['Decade'] = df['year'] - (df['year'] % 10)

In [15]:
# Adapted from text-matcher
class Text:
    """A thin wrapper around a text file that lazily reads and tokenizes it."""

    def __init__(self, filename):
        self.filename = filename

    @property
    def text(self):
        """ Reads the file in memory, ignoring undecodable bytes. """
        # Context manager closes the handle promptly; the original left
        # the file open until garbage collection.
        with open(self.filename, encoding='utf-8', errors='ignore') as f:
            return f.read()

    @property
    def tokens(self, removeStopwords=True):
        """ Tokenizes the text, breaking it up into words, removing punctuation.

        Side effects: stores the matched character spans in self.spans and
        the end offset of the last token in self.length.

        NOTE(review): because this is a @property, callers can never pass
        removeStopwords explicitly -- the default (True) is always used.
        """
        # Raw string: the original non-raw pattern ('[a-zA-Z]\w+...')
        # triggers invalid-escape-sequence warnings on modern Python.
        tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*")  # A custom regex tokenizer.
        spans = list(tokenizer.span_tokenize(self.text))
        # Take note of how many characters the tokenized text covers.
        self.length = spans[-1][-1]
        tokens = [token.lower() for token in tokenizer.tokenize(self.text)]
        if not removeStopwords:
            self.spans = spans
            return tokens
        tokenSpans = list(zip(tokens, spans))  # zip tokens with their spans
        stopwords = nltk.corpus.stopwords.words('english')
        tokenSpans = [pair for pair in tokenSpans if pair[0] not in stopwords]
        self.spans = [pair[1] for pair in tokenSpans]   # unzip; get spans
        return [pair[0] for pair in tokenSpans]         # unzip; get tokens

In [16]:
mm = Text('../middlemarch.txt')

In [17]:
# Get the size of the text (in characters).
textALength = 1793449

# I don't know why, but without the offset the novel ends too soon,
# with "unvisited tomb." This fixes it.
offset = 2
textALength += offset

# Make an empty per-character tally array the size of the text.
# dtype=int replaces the deprecated np.int alias, which was removed in
# NumPy 1.24 and makes this cell crash on modern NumPy.
tally = np.zeros(textALength, dtype=int)

In [18]:
len(tally)


Out[18]:
1793451

In [20]:
# Pull the match locations for each article. read_json already
# deserialized these into real lists, so the literal_eval step from the
# older CSV-based workflow is not needed.
locations = df['Locations in A']

In [21]:
type(locations[0])


Out[21]:
list

In [22]:
# Tally up every time a character in the novel is quoted.
# NumPy slice assignment increments a whole quoted range at once,
# replacing the original character-by-character Python inner loop.
# The end index is inclusive (hence +1) and clipped to the array length,
# exactly as the original range() bound was.
for article in locations:
    for locRange in article:
        start = locRange[0]
        end = min(locRange[1] + 1, len(tally))
        tally[start:end] += 1

In [23]:
mm.text[1793218:1793418]


Out[23]:
'growing good of the world is partly\ndependent on unhistoric acts; and that things are not so ill with you\nand me as they might have been, is half owing to the number who lived\nfaithfully a hidden life'

In [42]:
# Map every possible tally value (0..max) onto a 0-255 colormap index,
# then convert each resulting RGBA color to a hex string for CSS use.
# Let's hope there aren't too many distinct values.
maxCount = tally.max()
normalized = [int(round((count/maxCount * 255))) for count in range(maxCount + 1)]
colorList = [rgb2hex(rgba) for rgba in cm.gnuplot(normalized)]

In [43]:
# Create one CSS rule per tally level, e.g. ".c-3 { color: #aabbcc; }".
# colorList has exactly one entry per level, so enumerate walks the same
# (level, color) pairs the original zip did.
colorCSS = ""
for level, hexColor in enumerate(colorList):
    colorCSS += ".c-%s { color: %s; }\n" % (level, hexColor)

In [46]:
# Split the novel into n evenly spaced character offsets; these become
# the anchor targets that the navigation blocks link to.
n = 50

checkpoints = [int(point) for point in np.linspace(0, textALength, n).round()]

In [47]:
def span(val):
    """Returns an opening <span> tag whose class encodes a tally value."""
    return '<span class="c-%s">' % val

# Walk the novel character by character, opening a new <span> whenever
# the quote count changes and dropping an <a name="b-i"> anchor at each
# checkpoint so the navigation blocks can link into the text.
# Fixes vs. the original: the span() helper is actually used instead of
# being inlined twice; `is None` replaces `== None`; pieces are collected
# in a list and joined once instead of O(n^2) string concatenation; and
# checkpoint membership uses a set instead of scanning a list per char.
checkpointSet = set(checkpoints)
previousVal = None
parts = []
for i, (val, char) in enumerate(zip(tally, mm.text)):
    if previousVal is None:
        # First character: open the initial span.
        parts.append(span(val))
    elif val != previousVal:
        parts.append('</span>' + span(val))
    if i in checkpointSet:
        parts.append('<a name="b-%s"></a>' % checkpoints.index(i))
    parts.append(char)
    previousVal = val
out = ''.join(parts)

In [48]:
len(df)


Out[48]:
6069

In [49]:
# Make a list of valid decades.
decades = np.arange(1930, 2020, 10)

# Map each decade to the combined list of match locations from all of
# its articles. setdefault + extend always accumulates into a fresh
# list owned by the dict. The original's
#     decadeDict[decade] = locations; ... decadeDict[decade] += locations
# stored a reference to the list held inside the DataFrame row, so the
# later += extended df's own 'Locations in A' data in place.
# (Also avoids clobbering the module-level `locations` from an earlier cell.)
decadeDict = {}
for _, row in df.iterrows():
    decade = row['Decade']
    decadeDict.setdefault(decade, []).extend(row['Locations in A'])

In [50]:
# For each decade, keep only the starting offset of every quotation,
# then histogram those starts into n equal-width bins across the novel.
decadeStarts = {}
for decade, locs in decadeDict.items():
    decadeStarts[decade] = [pair[0] for pair in locs]

decadesBinned = {}
for decade, starts in decadeStarts.items():
    if decade in decades:
        decadesBinned[decade] = np.histogram(starts, bins=n, range=(0, textALength))[0]

# Rows are decades, columns are text bins.
decadesDF = pd.DataFrame(decadesBinned).T

In [51]:
# Normalize the per-bin grand totals into the 0..1 range, then rescale
# to 0..max(tally) so the blocks reuse the CSS color classes that were
# already generated for the text annotation.
binTotals = decadesDF.sum()
normalizedBlocks = binTotals / binTotals.max()
normalizedBlocks = round(normalizedBlocks * tally.max())

In [52]:
# Build the navigation strip: one colored square per text segment, each
# linking to its matching anchor in the annotated text.
divs = ['<div class="block b-%s"><a class="block" href="#b-%s">%s</a></div>' % (int(block), i, i)
        for i, block in enumerate(normalizedBlocks)]
blockHTML = '<section id="blocks">' + ''.join(divs) + '</section>'

In [53]:
# Base styling for the navigation squares, plus one background-color
# rule per tally level (same colormap as the annotated text).
blockCSS = """
#blocks {  }
.block, .block { 
  width: 30px; 
  height: 30px; 
  display: inline-block;
  color: white; 
  text-align: center;
  text-decoration: none;
  margin-top: 3px; 
}
"""

# One rule per color level, mirroring the .c-N text classes.
for level, hexColor in enumerate(colorList):
    blockCSS += '.b-%s { background-color: %s; }\n' % (level, hexColor)
colorCSS += blockCSS

In [54]:
# Assemble the final page: all generated CSS, the navigation blocks, and
# the annotated novel text (inside <pre> to preserve line breaks).
html = """<!DOCTYPE html>
<html>
<head>
  <link href="https://fonts.googleapis.com/css?family=Raleway" rel="stylesheet"> 
  <style>
  pre { 
      font-family: Raleway, serif; 
      font-size: 18px; 
      font-weight: 600;
    }
  main { 
      width: 40em; 
      margin: 2em auto; 
  }
  %s
  </style>
</head>
<body>%s<main><pre>%s</pre></main></body></html>
""" % (colorCSS, blockHTML, out)

In [55]:
# Write the annotated page to disk. The with-statement already closes
# the file on exit; the original's explicit f.close() was redundant.
with open('annotated.html', 'w') as f:
    f.write(html)

In [ ]: