Experiment 0 Annotator

This notebook annotates the text of the novel with the number of times each passage has been quoted.


In [1]:
import pandas as pd
import nltk
%matplotlib inline
import math
from ast import literal_eval
import numpy as np
import re
from matplotlib import pyplot as plt
from colour import Color
from IPython.core.display import HTML
from matplotlib import cm
from matplotlib.colors import rgb2hex
# NOTE(review): `math` and `HTML` appear unused in the visible cells — verify
# before removing.
# Default figure size for every plot in this notebook.
plt.rcParams["figure.figsize"] = [16, 6]

In [2]:
# Load the text-matcher log: one row per (novel, article) pair, with the
# matched character ranges in each text stored as stringified lists.
df = pd.read_csv('e0d/log.txt')
df.head()


Out[2]:
Text A Text B Threshold N-Grams Num Matches Text A Length Text B Length Locations in A Locations in B
0 middlemarch.txt e0b/e0b-txt/WOLFE_2002_IRIS MURDOCH APPLIED TO... 6 3 17 1793446 43119 [(539109, 539353), (539391, 539432), (539504, ... [(26234, 26478), (26481, 26521), (26567, 26802...
1 middlemarch.txt e0b/e0b-txt/Hardy_1954_The Moment of Disenchan... 6 3 3 1793446 20226 [(580711, 580936), (580948, 581020), (1691325,... [(9662, 9886), (9897, 9969), (19879, 20002)]
2 middlemarch.txt e0b/e0b-txt/MORRIS_1990_THE DIALOGIC UNIVERSE ... 6 3 21 1793446 47530 [(1615, 1964), (29948, 30025), (40132, 40256),... [(4254, 4605), (5612, 5690), (6239, 6364), (75...
3 middlemarch.txt e0b/e0b-txt/Guth_1999_George Eliot and Schille... 6 3 5 1793446 46745 [(8798, 8850), (8870, 8930), (83611, 83687), (... [(18049, 18101), (18127, 18188), (19244, 19319...
4 middlemarch.txt e0b/e0b-txt/Payne_1999_The Serialist Vanishes.txt 6 3 4 1793446 60072 [(345200, 345488), (1608886, 1609417), (161169... [(4009, 4298), (25511, 26041), (26698, 26767),...

In [3]:
# Adapted from text-matcher
class Text: 
    """Wraps a plain-text file, giving lazy access to its text and tokens."""

    def __init__(self, filename): 
        self.filename = filename
        
    @property
    def text(self):
        """ Reads the file in memory. """
        # Context manager closes the handle promptly; the original version
        # leaked the open file until garbage collection.
        with open(self.filename, encoding='utf-8', errors='ignore') as f:
            return f.read()

    @property
    def tokens(self): 
        """ Tokenizes the text, breaking it up into words, removing
        punctuation and English stopwords.

        Side effects: sets self.length (end offset of the last token) and
        self.spans (per-kept-token (start, end) character offsets).
        """
        # NOTE: the original signature was `tokens(self, removeStopwords=True)`,
        # but a @property getter can never receive arguments, so stopwords were
        # always removed; the dead parameter has been dropped.
        # Raw string avoids the deprecated '\w' escape-sequence warning.
        tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+\'?\w*")  # A custom regex tokenizer.
        text = self.text  # read the file once instead of twice
        spans = list(tokenizer.span_tokenize(text))
        # Take note of how far into the text the last token reaches.
        self.length = spans[-1][-1] if spans else 0
        tokens = [token.lower() for token in tokenizer.tokenize(text)]  # make them lowercase
        tokenSpans = list(zip(tokens, spans))  # zip it up
        stopwords = nltk.corpus.stopwords.words('english')  # get stopwords
        tokenSpans = [pair for pair in tokenSpans if pair[0] not in stopwords]  # remove stopwords
        self.spans = [span for _, span in tokenSpans]  # unzip; get spans
        return [token for token, _ in tokenSpans]  # unzip; get tokens

In [4]:
# The full text of the novel; Text reads it from disk on each .text access.
mm = Text('middlemarch.txt')

In [5]:
# Get the size of the text. 
textALength = df['Text A Length'][0]

# I don't know why, but without the offset the novel ends too soon,
# with "unvisited tomb." This fixes it. 
offset = 2
textALength += offset

# Make an empty array the size of the text.
# np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin int
# gives the same platform-default integer dtype.
tally = np.zeros(textALength, dtype=int)

In [6]:
# Sanity check: the tally array covers every character plus the 2-char offset.
len(tally)


Out[6]:
1793448

In [7]:
# NOTE(review): leftover scratch cell (checking min() semantics for the tally
# loop below) — safe to delete.
min(1,2)


Out[7]:
1

In [8]:
# Read the locations from the CSV file, and literally evaluate them into lists. 
locations = df['Locations in A'].apply(literal_eval)

In [9]:
# Tally up every time a letter in the novel is quoted.
# Vectorized slice-increment: equivalent to incrementing every index in the
# range one by one, but far faster. NumPy slicing clamps the stop index to
# len(tally), matching the min(...) guard the per-index loop needed.
for article in locations: 
    for locRange in article: 
        tally[locRange[0]:locRange[1] + 1] += 1

In [10]:
# Make a color list in hex for all the values in the tally. 
# Let's hope there aren't too many. 
# NOTE(review): this colour-module gradient is immediately overwritten by the
# matplotlib-based colorList in the next cell, so this cell is effectively
# dead code — consider deleting it.
colors = list(Color("blue").range_to(Color("red"),tally.max()+1))
for color in colors: 
    color.set_luminance(0.42)
colorList = [color.get_hex() for color in colors]
# Prepend black for unquoted text; drop the last color so the list length
# stays tally.max()+1.
colorList = ['#000'] + colorList[:-1]

In [11]:
# Assumed maximum quote count + 1; NOTE(review): if tally.max() ever exceeds
# cmax - 1, the later zips against colorList silently truncate and higher
# counts get no CSS class — verify cmax against tally.max().
cmax = 16
# Integer inputs index the colormap's 256-entry lookup table directly
# (floats would instead be read as 0-1 fractions).
normalized = [round((i/cmax) * 255) for i in range(cmax)]
print(normalized)
hexes = [rgb2hex(x) for x in cm.gnuplot(normalized)]
colorList = hexes
colorList


[0, 16, 32, 48, 64, 80, 96, 112, 128, 143, 159, 175, 191, 207, 223, 239]
Out[11]:
['#000000',
 '#400062',
 '#5a01b5',
 '#6f02ec',
 '#8004ff',
 '#8f08eb',
 '#9c0eb3',
 '#a9165f',
 '#b52000',
 '#bf2d00',
 '#c93e00',
 '#d35200',
 '#dd6b00',
 '#e68800',
 '#eeab00',
 '#f7d200']

In [12]:
# Create one CSS class (.c-N) per quote-count value, colored from the map.
# zip stops at the shorter of the two sequences.
colorCSS = "".join(
    ".c-%s { color: %s; }\n" % (i, color)
    for i, color in zip(range(tally.max() + 1), colorList)
)

In [13]:
n = 50

# n evenly spaced anchor positions through the novel, as plain Python ints.
checkpoints = np.linspace(0, textALength, n).round().astype(int).tolist()

In [14]:
def span(val): 
    """Open a <span> tag CSS-classed by quote count."""
    return '<span class="c-%s">' % val

# Map checkpoint position -> its index. Keeps the first occurrence (like
# list.index) and makes the per-character lookup O(1); the original scanned
# the checkpoint list twice for every one of ~1.8M characters.
checkpointIndex = {}
for idx, point in enumerate(checkpoints):
    checkpointIndex.setdefault(point, idx)

out = ''
previousVal = None
for i, (val, char) in enumerate(zip(tally, mm.text)):
    if previousVal is None: 
        # First character: open the initial span.
        out = span(val)
    elif val != previousVal: 
        # Quote count changed: close the previous span, open a new one.
        out += '</span>' + span(val)
    if i in checkpointIndex: 
        out += '<a name="b-%s"></a>' % checkpointIndex[i]
    out += char
    previousVal = val
# Close the final span (the original left it unclosed).
if previousVal is not None:
    out += '</span>'

In [15]:
# Get dates
def getDate(filename): 
    """
    Extract the four-digit year from filenames of the form
    'Author_YYYY_Title.txt'.

    Returns the year as an int, or None when no '_YYYY_' segment is found.
    """
    # Raw string avoids the deprecated '\d' escape-sequence warning.
    m = re.search(r'_(\d{4})_', filename)
    if m is not None: 
        return int(m.group(1))
    else:
        return None

df['Date'] = df['Text B'].apply(getDate)
df['Decade'] = df['Date'] - (df['Date'] % 10)

# Make a list of valid decades. 
decades = np.arange(1930, 2020, 10)

# Group quote locations by the decade of the quoting article.
# Keys are decades; values are flat lists of (start, end) ranges in the novel.
decadeDict = {}
for _, row in df.iterrows():
    decadeDict.setdefault(row['Decade'], []).extend(
        literal_eval(row['Locations in A']))

# Grab the beginnings of quotes. 
decadeStarts = {decade: [start for start, _ in locs]
                for decade, locs in decadeDict.items()}

# Histogram the quote-start positions into n equal slices of the novel,
# keeping only decades in the valid range.
decadesBinned = {decade: np.histogram(starts, bins=n, range=(0, textALength))[0]
                 for decade, starts in decadeStarts.items()
                 if decade in decades}

decadesDF = pd.DataFrame(decadesBinned).T

In [16]:
# Normalize the totals for each section (compute the column sums once).
sectionTotals = decadesDF.sum()
normalizedBlocks = sectionTotals / sectionTotals.max()

# Now use the scale that we're already using for the CSS. 
normalizedBlocks = round(normalizedBlocks * tally.max())

In [17]:
# Build the clickable navigation bar: one colored div per section of the
# novel, each linking to its <a name="b-N"> anchor in the annotated text.
cells = ['<div class="block b-%s"><a class="block" href="#b-%s">%s</a></div>'
         % (int(block), i, i)
         for i, block in enumerate(normalizedBlocks)]
blockHTML = '<section id="blocks">' + ''.join(cells) + '</section>'

In [18]:
# Base styling for the navigation blocks.
# NOTE(review): the selector '.block, .block' repeats the same class —
# presumably one of the two was meant to be something else; left untouched
# because editing the CSS string would change the rendered page.
blockCSS = """
#blocks {  }
.block, .block { 
  width: 30px; 
  height: 30px; 
  display: inline-block;
  color: white; 
  text-align: center;
  text-decoration: none;
  margin-top: 3px; 
}
"""

# One background-color rule (.b-N) per tally value, reusing the text color
# scale; zip truncates to the shorter of the two sequences.
for i, color in zip(range(0, tally.max()+1), colorList): 
    blockCSS += '.b-%s { background-color: %s; }\n' % (i, color)
colorCSS += blockCSS

In [19]:
# Assemble the final page. The three %s slots are filled, in order, with:
# colorCSS (text + block color rules), blockHTML (the navigation bar), and
# `out` (the span-annotated novel text, rendered inside <pre>).
html = """<!DOCTYPE html>
<html>
<head>
  <link href="https://fonts.googleapis.com/css?family=Raleway" rel="stylesheet"> 
  <style>
  pre { 
      font-family: Raleway, serif; 
      font-size: 18px; 
    }
  main { 
      width: 40em; 
      margin: 2em auto; 
  }
  %s
  </style>
</head>
<body>%s<main><pre>%s</pre></main></body></html>
""" % (colorCSS, blockHTML, out)

In [20]:
# Write the annotated page to disk. The with-block closes the file on exit,
# so the explicit f.close() the original added was redundant.
with open('annotated.html', 'w') as f: 
    f.write(html)