In [1]:
import pandas as pd
%matplotlib inline
from ast import literal_eval
import numpy as np
import re
from matplotlib import pyplot as plt
# Default every figure in this notebook to a wide 16x6 canvas.
plt.rcParams.update({"figure.figsize": [16, 6]})
In [2]:
# Load the experiment results written by text-matcher to its log file.
logPath = 'e0b/log.txt'
df = pd.read_csv(logPath)
In [3]:
def getDate(filename):
    """
    Extract the four-digit date embedded in a filename.

    The date is the first run of exactly four digits that is enclosed by
    underscores, e.g. ``'author_1987_title.txt'`` gives ``1987``.

    Parameters
    ----------
    filename : str
        Filename to search. Non-string values (for example the NaN that
        pandas uses for a missing cell) are treated as having no date.

    Returns
    -------
    int or None
        The matched digits as an integer, or None when nothing matches.
    """
    # Guard against missing cells: re.search raises TypeError on non-strings.
    if not isinstance(filename, str):
        return None
    # Raw string so that \d reaches the regex engine instead of being
    # treated as an (invalid) Python string escape.
    m = re.search(r'_(\d{4})_', filename)
    if m is None:
        return None
    return int(m.group(1))
# Parse a date out of each 'Text B' filename, then round it down to the
# start of its decade (e.g. 1987 -> 1980).
df['Date'] = df['Text B'].apply(getDate)
df['Decade'] = df['Date'].sub(df['Date'].mod(10))
In [4]:
# Preview the first five rows, including the new Date and Decade columns.
df.head(n=5)
Out[4]:
In [5]:
# Distribution of the extracted dates across the rows of the log (10 bins).
df['Date'].hist(bins=10)
Out[5]:
In [6]:
# Take the 'Text A Length' value from the row labelled 0.
# NOTE(review): presumably Text A is the same text in every row, so any
# row would do -- confirm against the log.
textALength = df.loc[0, 'Text A Length']
textALength
Out[6]:
In [7]:
# Decades included in the analysis: 1950, 1960, ..., 2010 (stop is exclusive).
decades = np.arange(start=1950, stop=2020, step=10)
In [8]:
# Group match locations by decade.
# Each key is a decade; each value is the concatenation of the location
# lists parsed from 'Locations in A' for every row in that decade.
decadeDict = {}
for rowIndex, row in df.iterrows():
    rowLocations = literal_eval(row['Locations in A'])
    decadeDict.setdefault(row['Decade'], []).extend(rowLocations)
In [9]:
# Keep only the first element of every location -- the position in Text A
# where the quote begins.
decadeStarts = {
    key: [span[0] for span in spans]
    for key, spans in decadeDict.items()
}
In [10]:
# For each decade of interest, count quote starts in 50 equal-width bins
# spanning Text A. np.histogram returns (counts, edges); keep the counts.
decadesBinned = {
    key: np.histogram(starts, bins=50, range=(0, textALength))[0]
    for key, starts in decadeStarts.items()
    if key in decades
}
In [11]:
# One row per decade, one column per bin of Text A.
# The dict's key order follows the order in which decades first appear in
# the log, so sort the index explicitly: the heatmap and its tick labels
# rely on the rows being in chronological order.
decadesDF = pd.DataFrame(decadesBinned).T.sort_index()
In [12]:
# Display the per-decade bin counts (before normalization).
decadesDF
Out[12]:
In [13]:
# Normalize each decade (row) by its own largest bin so that every row
# peaks at 1 and decades with different totals can be compared.
rowPeaks = decadesDF.max(axis=1)
decadesDF = decadesDF.div(rowPeaks, axis=0)
In [14]:
# Tick labels mark the edges of the heatmap rows: one label at the start of
# each decade plus a closing label for the end of the last row. Derive the
# closing label from the data (last decade + 10) instead of hard-coding it,
# so it stays correct when the last row is not the 2010s. '2020' is kept
# only as the fallback for an empty frame.
ylabels = [str(int(decade)) for decade in decadesDF.index]
ylabels.append(str(int(decadesDF.index[-1]) + 10) if len(ylabels) else '2020')
In [54]:
# Heatmap with one row per decade and one column per segment of the novel.
fig, ax = plt.subplots()
heatmap = ax.pcolor(decadesDF, cmap='gnuplot')
# Ticks sit on the row edges, hence one more tick than there are rows.
ax.set_yticks(np.arange(len(decadesDF.index)+1))
ax.set_yticklabels(ylabels)
# Flip the axis so the first row of the frame is drawn at the top.
ax.invert_yaxis()
ax.set_ylabel('Decade')
ax.set_xlabel('Novel Segment')
ax.set_title("Frequency of Quotations from George Eliot's Middlemarch in Criticism, By Decade")
# Colorbar without tick marks: only the relative scale is shown.
fig.colorbar(heatmap, ax=ax, ticks=[])
plt.show()
In [16]:
# Bar chart of the per-segment column sums across all decades.
# NOTE(review): decadesDF has already been normalized per decade at this
# point, so these are sums of normalized values, not raw quote counts --
# confirm that is intended.
decadesDF.sum().plot.bar()
Out[16]: