This experiment used 483 texts scraped from JSTOR, 280 of which contain text matches. In an effort to make the corpus more diachronic, some of these texts were scraped from decade-based searches, e.g., a search for the keyword "Middlemarch" in articles from 1930 to 1939. This experiment was conducted with text-matcher 0.1.4, which fixes the issue with matches cutting off too early. It also uses a threshold of 6 words, in an attempt to avoid false positives.
In [1]:
import pandas as pd
%matplotlib inline
from ast import literal_eval
import numpy as np
import re
from matplotlib import pyplot as plt
# Make every figure in this notebook wide by default (width, height in inches).
plt.rcParams["figure.figsize"] = [16, 6]
In [2]:
# Read in the experiment results from the text-matcher log file.
# The cells below rely on its 'Text B', 'Text A Length' and
# 'Locations in A' columns.
df = pd.read_csv('e0d/log.txt')
In [3]:
def getDate(filename):
    """
    Extract a four-digit date (presumably the publication year) from a filename.

    The filename is searched for the first run of exactly four digits that is
    enclosed by underscores, e.g. ``_1985_``.

    Returns the matched digits as an int, or None when the filename contains
    no such run.
    """
    # Raw string: in a plain literal '\d' is an invalid escape sequence, which
    # newer Python versions warn about and will eventually reject.
    m = re.search(r'_(\d{4})_', filename)
    if m is None:
        return None
    return int(m.group(1))
# Year parsed from each Text B filename (missing when no year is found).
df['Date'] = df['Text B'].apply(getDate)
# Round each year down to the start of its decade, e.g. 1985 -> 1980.
df['Decade'] = (df['Date'] // 10) * 10
In [4]:
# Preview the first rows, including the new 'Date' and 'Decade' columns.
df.head()
Out[4]:
In [5]:
# Histogram of the dates extracted from the Text B filenames.
df['Date'].hist()
Out[5]:
In [6]:
# Length of Text A, read from the row labelled 0
# (assumes Text A is the same for every row -- TODO confirm).
textALength = df.loc[0, 'Text A Length']
textALength
Out[6]:
In [7]:
# Decades kept for the analysis: 1950, 1960, ..., 2010 (the stop value is exclusive).
decades = np.arange(start=1950, stop=2020, step=10)
In [8]:
# Group the match locations by decade:
# each key is a decade, each value the combined list of locations in Text A
# gathered from every row that falls in that decade.
decadeDict = {}
for _, record in df.iterrows():
    # 'Locations in A' is stored as the text of a Python literal; parse it back.
    parsed = literal_eval(record['Locations in A'])
    decadeDict.setdefault(record['Decade'], []).extend(parsed)
In [9]:
# Keep only the first element of every location (the beginning of the quote).
decadeStarts = {
    period: [span[0] for span in spans]
    for period, spans in decadeDict.items()
}
In [10]:
# For each decade of interest, count the quote starts falling into each of
# 50 equal-width bins spanning Text A; np.histogram returns (counts, edges)
# and only the counts are kept.
decadesBinned = {
    period: np.histogram(starts, bins=50, range=(0, textALength))[0]
    for period, starts in decadeStarts.items()
    if period in decades
}
In [11]:
# One row per decade, one column per novel segment (bin).
# The dict's key order follows the first appearance of each decade in the
# log, so sort the rows to guarantee a chronological axis in the plots below.
decadesDF = pd.DataFrame(decadesBinned).T.sort_index()
In [12]:
# Show the per-decade bin counts (rows: decades, columns: bins).
decadesDF
Out[12]:
In [13]:
# Normalize: divide every decade's row by that row's own maximum,
# so the busiest segment of each decade becomes 1.
rowPeaks = decadesDF.max(axis=1)
decadesDF = decadesDF.divide(rowPeaks, axis='index')
In [14]:
# Labels for the heatmap's row edges: the start of each decade plus one
# closing edge, ten years after the last decade shown. The closing label is
# derived from the data instead of being fixed at '2020', so it stays correct
# when the last decade present is not the 2010s. '2020' remains the fallback
# for an empty frame.
closingEdge = int(decadesDF.index[-1]) + 10 if len(decadesDF.index) else 2020
ylabels = [str(int(decade)) for decade in decadesDF.index] + [str(closingEdge)]
In [15]:
# Heatmap of the normalized counts: one row per decade, one column per bin.
plt.pcolor(decadesDF)
plt.title("Number of Critical Quotations from George Eliot's Middlemarch, By Decade")
plt.xlabel('Quotes Per Novel Segment')
plt.ylabel('Decade')
# One tick per row edge, hence one more tick than there are rows.
edgePositions = np.arange(len(decadesDF.index) + 1)
plt.yticks(edgePositions, ylabels)
# Flip the y axis so the first row is drawn at the top.
plt.gca().invert_yaxis()
# Disabled: explicit x ticks, one per bin column.
#plt.xticks(np.arange(len(decadesDF.columns)), decadesDF.columns)
plt.show()
In [16]:
# Column sums (each bin totalled across all decades), drawn as a bar chart.
decadesDF.sum().plot.bar()
Out[16]:
In [ ]: