In [11]:
import pandas as pd
import nltk
%matplotlib inline
import math
from ast import literal_eval
import numpy as np
import re
from matplotlib import pyplot as plt
from colour import Color
from IPython.core.display import HTML
from matplotlib import cm
from matplotlib.colors import rgb2hex
plt.rcParams["figure.figsize"] = [16, 6]
In [12]:
# Load the precomputed article/match records exported by text-matcher.
df = pd.read_json('../txt/e2a.json')
In [14]:
# Bucket each article's publication year into its decade (e.g. 1967 -> 1960).
df['Decade'] = (df['year'] // 10) * 10
In [15]:
# Adapted from text-matcher
class Text:
    """A plain-text file on disk, with lazy reading and tokenization."""

    def __init__(self, filename):
        self.filename = filename

    @property
    def text(self):
        """Reads the file into memory and returns it as one string."""
        # Context manager guarantees the file handle is closed (the
        # original left it open).
        with open(self.filename, encoding='utf-8', errors='ignore') as f:
            return f.read()

    @property
    def tokens(self, removeStopwords=True):
        """Tokenizes the text, breaking it up into words, removing punctuation.

        Side effects: sets self.length (end offset of the final token) and
        self.spans (character spans of the returned tokens).

        NOTE(review): because this is a @property it is always accessed
        without arguments, so removeStopwords is effectively always True.
        Kept as-is to preserve the original interface.
        """
        # Read the file once instead of once per tokenizer call.
        raw = self.text
        # Raw string avoids the invalid-escape-sequence warning for \w.
        tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*")  # A custom regex tokenizer.
        spans = list(tokenizer.span_tokenize(raw))
        # Take note of how many spans there are in the text (0 if empty,
        # instead of crashing on spans[-1]).
        self.length = spans[-1][-1] if spans else 0
        tokens = [token.lower() for token in tokenizer.tokenize(raw)]  # make them lowercase
        if not removeStopwords:
            self.spans = spans
            return tokens
        # set() turns the O(n) per-token list lookup into O(1).
        stopwords = set(nltk.corpus.stopwords.words('english'))
        tokenSpans = [(tok, sp) for tok, sp in zip(tokens, spans)
                      if tok not in stopwords]  # remove stopwords
        self.spans = [sp for tok, sp in tokenSpans]  # unzip; get spans
        return [tok for tok, sp in tokenSpans]  # unzip; get tokens
In [16]:
# Wrap the full Middlemarch plain text for reading below.
mm = Text('../middlemarch.txt')
In [17]:
# Get the size of the text.
textALength = 1793449
# I don't know why, but without the offset the novel ends too soon,
# with "unvisited tomb." This fixes it.
offset = 2
textALength += offset
# Make an empty array the size of the text.
tally = np.zeros(textALength, dtype=np.int)
In [18]:
# Sanity check: the tally covers every character offset in the novel.
len(tally)
Out[18]:
In [20]:
# Read the locations from the CSV file, and literally evaluate them into lists.
# Each entry is a list of (start, end) character ranges in Middlemarch.
locations = df['Locations in A']
# literal_eval is only needed when the ranges arrive as strings (CSV);
# the JSON load above already yields real lists.
# locations = locations.apply(literal_eval)
In [21]:
# Confirm the ranges are real lists, not their string representation.
type(locations[0])
Out[21]:
In [22]:
# Tally up every time a letter in the novel is quoted.
# Tally up every time a letter in the novel is quoted.
for article in locations:
    for locRange in article:
        start, end = locRange[0], locRange[1]
        # Ranges are inclusive; clamp the stop to the array length.
        stop = min(end + 1, len(tally))
        for pos in range(start, stop):
            tally[pos] += 1
In [23]:
# Spot-check the end of the novel ("unvisited tomb") to validate offsets.
mm.text[1793218:1793418]
Out[23]:
In [42]:
# Make a color list in hex for all the values in the tally.
# Let's hope there aren't too many.
normalized = [int(round((i/tally.max() * 255))) for i in range(tally.max()+1)]
colorList = [rgb2hex(x) for x in cm.gnuplot(normalized)]
In [43]:
# Create a CSS Stylesheet for each color value in the map.
colorCSS = ""
for i, color in zip(range(0, tally.max()+1), colorList):
colorCSS += ".c-%s { color: %s; }\n" % (i, color)
In [46]:
# Drop n evenly spaced anchor positions across the novel so the block
# navigation bar can link into the annotated text.
n = 50
checkpoints = np.linspace(0, textALength, n).round().astype(int).tolist()
In [47]:
def span(val):
    """Return an opening <span> whose class encodes the quote count."""
    return '<span class="c-%s">' % val

# Walk the novel character by character, wrapping runs of equal tally
# values in color-coded spans and dropping an anchor at each checkpoint.
# Precompute position -> checkpoint index: the original did list
# membership plus .index() inside a ~1.8M-iteration loop. linspace over
# this range yields unique positions, so the dict is equivalent.
checkpointIndex = {pos: idx for idx, pos in enumerate(checkpoints)}
previousVal = None
for i, (val, char) in enumerate(zip(tally, mm.text)):
    if previousVal is None:
        # First character: open the initial span (original dead helper
        # span() is now actually used).
        out = span(val)
    elif val != previousVal:
        out += '</span>' + span(val)
    if i in checkpointIndex:
        out += '<a name="b-%s"></a>' % checkpointIndex[i]
    out += char
    previousVal = val
In [48]:
# Number of articles in the corpus.
len(df)
Out[48]:
In [49]:
# Make a list of valid decades.
decades = np.arange(1930, 2020, 10)

# Map each decade to the combined list of quote locations for all
# articles published in that decade.
decadeDict = {}
for _, row in df.iterrows():
    decade = row['Decade']
    # Copy the row's list. The original stored a reference and then
    # extended it in place with +=, which also mutated the list held
    # inside the DataFrame itself (and it shadowed the module-level
    # `locations` variable).
    rowLocations = list(row['Locations in A'])
    if decade not in decadeDict:
        decadeDict[decade] = rowLocations
    else:
        decadeDict[decade] += rowLocations
In [50]:
# Grab the beginnings of quotes.
decadeStarts = {}
for decade, locs in decadeDict.items():
    decadeStarts[decade] = [pair[0] for pair in locs]

# Histogram each decade's quote-start offsets into n equal-width bins
# spanning the novel, keeping only decades in the valid range.
decadesBinned = {}
for decade, starts in decadeStarts.items():
    if decade in decades:
        decadesBinned[decade] = np.histogram(starts, bins=n, range=(0, textALength))[0]

decadesDF = pd.DataFrame(decadesBinned).T
In [51]:
# Normalize the totals for each section to [0, 1].
sectionTotals = decadesDF.sum()
normalizedBlocks = sectionTotals / sectionTotals.max()
# Now rescale onto the 0..tally.max() scale the CSS already uses.
normalizedBlocks = (normalizedBlocks * tally.max()).round()
In [52]:
# Build the navigation bar: one colored square per section, each
# linking to the matching anchor in the annotated text.
cells = []
for idx, level in enumerate(normalizedBlocks):
    cells.append('<div class="block b-%s"><a class="block" href="#b-%s">%s</a></div>' % (int(level), idx, idx))
blockHTML = '<section id="blocks">' + ''.join(cells) + '</section>'
In [53]:
blockCSS = """
#blocks { }
.block, .block {
width: 30px;
height: 30px;
display: inline-block;
color: white;
text-align: center;
text-decoration: none;
margin-top: 3px;
}
"""
for i, color in zip(range(0, tally.max()+1), colorList):
blockCSS += '.b-%s { background-color: %s; }\n' % (i, color)
colorCSS += blockCSS
In [54]:
html = """<!DOCTYPE html>
<html>
<head>
<link href="https://fonts.googleapis.com/css?family=Raleway" rel="stylesheet">
<style>
pre {
font-family: Raleway, serif;
font-size: 18px;
font-weight: 600;
}
main {
width: 40em;
margin: 2em auto;
}
%s
</style>
</head>
<body>%s<main><pre>%s</pre></main></body></html>
""" % (colorCSS, blockHTML, out)
In [55]:
# Write the rendered page. The with-block closes the file on exit; the
# original's explicit f.close() inside it was redundant.
with open('annotated.html', 'w') as f:
    f.write(html)
In [ ]: