In [11]:
import pandas as pd
import nltk
%matplotlib inline
import math
from ast import literal_eval
import numpy as np
import re
from matplotlib import pyplot as plt
from colour import Color
from IPython.core.display import HTML
from matplotlib import cm
from matplotlib.colors import rgb2hex
plt.rcParams["figure.figsize"] = [16, 6]
In [12]:
# Load the precomputed article/match records exported by text-matcher.
df = pd.read_json('../txt/e2a.json')
In [14]:
# Bucket each article's publication year into its decade (e.g. 1967 -> 1960).
df['Decade'] = (df['year'] // 10) * 10
In [15]:
# Adapted from text-matcher
class Text:
    """A plain-text file on disk, with lazy reading and tokenization."""

    def __init__(self, filename):
        self.filename = filename

    @property
    def text(self):
        """Reads the file into memory and returns it as one string."""
        # Context manager guarantees the file handle is closed (the
        # original left it open).
        with open(self.filename, encoding='utf-8', errors='ignore') as f:
            return f.read()

    @property
    def tokens(self, removeStopwords=True):
        """Tokenizes the text, breaking it up into words, removing punctuation.

        Side effects: sets self.length (end offset of the final token) and
        self.spans (character spans of the returned tokens).

        NOTE(review): because this is a @property it is always accessed
        without arguments, so removeStopwords is effectively always True.
        Kept as-is to preserve the original interface.
        """
        # Read the file once instead of once per tokenizer call.
        raw = self.text
        # Raw string avoids the invalid-escape-sequence warning for \w.
        tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*")  # A custom regex tokenizer.
        spans = list(tokenizer.span_tokenize(raw))
        # Take note of how many spans there are in the text (0 if empty,
        # instead of crashing on spans[-1]).
        self.length = spans[-1][-1] if spans else 0
        tokens = [token.lower() for token in tokenizer.tokenize(raw)]  # make them lowercase
        if not removeStopwords:
            self.spans = spans
            return tokens
        # set() turns the O(n) per-token list lookup into O(1).
        stopwords = set(nltk.corpus.stopwords.words('english'))
        tokenSpans = [(tok, sp) for tok, sp in zip(tokens, spans)
                      if tok not in stopwords]  # remove stopwords
        self.spans = [sp for tok, sp in tokenSpans]  # unzip; get spans
        return [tok for tok, sp in tokenSpans]  # unzip; get tokens
In [16]:
# Wrap the full Middlemarch plain text for reading below.
mm = Text('../middlemarch.txt')
In [17]:
# Get the size of the text.
textALength = 1793449
# I don't know why, but without the offset the novel ends too soon,
# with "unvisited tomb." This fixes it.
offset = 2
textALength += offset
# Make an empty array the size of the text.
tally = np.zeros(textALength, dtype=np.int)
In [18]:
# Sanity check: the tally covers every character offset in the novel.
len(tally)
Out[18]:
In [20]:
# Read the locations from the CSV file, and literally evaluate them into lists.
# Each entry is a list of (start, end) character ranges in Middlemarch.
locations = df['Locations in A']
# literal_eval is only needed when the ranges arrive as strings (CSV);
# the JSON load above already yields real lists.
# locations = locations.apply(literal_eval)
In [21]:
# Confirm the ranges are real lists, not their string representation.
type(locations[0])
Out[21]:
In [22]:
# Tally up every time a letter in the novel is quoted.
# Tally up every time a letter in the novel is quoted.
for article in locations:
    for locRange in article:
        start, end = locRange[0], locRange[1]
        # Ranges are inclusive; clamp the stop to the array length.
        stop = min(end + 1, len(tally))
        for pos in range(start, stop):
            tally[pos] += 1
In [23]:
# Spot-check the end of the novel ("unvisited tomb") to validate offsets.
mm.text[1793218:1793418]
Out[23]:
In [42]:
# Make a color list in hex for all the values in the tally.
# Let's hope there aren't too many.
normalized = [int(round((i/tally.max() * 255))) for i in range(tally.max()+1)]
colorList = [rgb2hex(x) for x in cm.gnuplot(normalized)]
In [43]:
# Create a CSS Stylesheet for each color value in the map.
colorCSS = ""
for i, color in zip(range(0, tally.max()+1), colorList):
colorCSS += ".c-%s { color: %s; }\n" % (i, color)
In [46]:
# Drop n evenly spaced anchor positions across the novel so the block
# navigation bar can link into the annotated text.
n = 50
checkpoints = np.linspace(0, textALength, n).round().astype(int).tolist()
In [47]:
def span(val):
    """Return an opening <span> whose class encodes the quote count."""
    return '<span class="c-%s">' % val

# Walk the novel character by character, wrapping runs of equal tally
# values in color-coded spans and dropping an anchor at each checkpoint.
# Precompute position -> checkpoint index: the original did list
# membership plus .index() inside a ~1.8M-iteration loop. linspace over
# this range yields unique positions, so the dict is equivalent.
checkpointIndex = {pos: idx for idx, pos in enumerate(checkpoints)}
previousVal = None
for i, (val, char) in enumerate(zip(tally, mm.text)):
    if previousVal is None:
        # First character: open the initial span (original dead helper
        # span() is now actually used).
        out = span(val)
    elif val != previousVal:
        out += '</span>' + span(val)
    if i in checkpointIndex:
        out += '<a name="b-%s"></a>' % checkpointIndex[i]
    out += char
    previousVal = val
In [48]:
# Number of articles in the corpus.
len(df)
Out[48]:
In [49]:
# Make a list of valid decades.
decades = np.arange(1930, 2020, 10)

# Map each decade to the combined list of quote locations for all
# articles published in that decade.
decadeDict = {}
for _, row in df.iterrows():
    decade = row['Decade']
    # Copy the row's list. The original stored a reference and then
    # extended it in place with +=, which also mutated the list held
    # inside the DataFrame itself (and it shadowed the module-level
    # `locations` variable).
    rowLocations = list(row['Locations in A'])
    if decade not in decadeDict:
        decadeDict[decade] = rowLocations
    else:
        decadeDict[decade] += rowLocations
In [50]:
# Grab the beginnings of quotes.
decadeStarts = {}
for decade, locs in decadeDict.items():
    decadeStarts[decade] = [pair[0] for pair in locs]

# Histogram each decade's quote-start offsets into n equal-width bins
# spanning the novel, keeping only decades in the valid range.
decadesBinned = {}
for decade, starts in decadeStarts.items():
    if decade in decades:
        decadesBinned[decade] = np.histogram(starts, bins=n, range=(0, textALength))[0]

decadesDF = pd.DataFrame(decadesBinned).T
In [51]:
# Normalize the totals for each section to [0, 1].
sectionTotals = decadesDF.sum()
normalizedBlocks = sectionTotals / sectionTotals.max()
# Now rescale onto the 0..tally.max() scale the CSS already uses.
normalizedBlocks = (normalizedBlocks * tally.max()).round()
In [52]:
# Build the navigation bar: one colored square per section, each
# linking to the matching anchor in the annotated text.
cells = []
for idx, level in enumerate(normalizedBlocks):
    cells.append('<div class="block b-%s"><a class="block" href="#b-%s">%s</a></div>' % (int(level), idx, idx))
blockHTML = '<section id="blocks">' + ''.join(cells) + '</section>'
In [53]:
blockCSS = """
#blocks { }
.block, .block {
width: 30px;
height: 30px;
display: inline-block;
color: white;
text-align: center;
text-decoration: none;
margin-top: 3px;
}
"""
for i, color in zip(range(0, tally.max()+1), colorList):
blockCSS += '.b-%s { background-color: %s; }\n' % (i, color)
colorCSS += blockCSS
In [54]:
html = """<!DOCTYPE html>
<html>
<head>
<link href="https://fonts.googleapis.com/css?family=Raleway" rel="stylesheet">
<style>
pre {
font-family: Raleway, serif;
font-size: 18px;
font-weight: 600;
}
main {
width: 40em;
margin: 2em auto;
}
%s
</style>
</head>
<body>%s<main><pre>%s</pre></main></body></html>
""" % (colorCSS, blockHTML, out)
In [55]:
# Write the rendered page. The with-block closes the file on exit; the
# original's explicit f.close() inside it was redundant.
with open('annotated.html', 'w') as f:
    f.write(html)
In [ ]: