Experiment 0-B

This experiment used 483 texts scraped from JSTOR, 280 of which contain text matches. In an effort to make the corpus more diachronic, some of these texts were scraped from decade-based searches, e.g. a search for the keyword "Middlemarch" in articles from 1930 to 1939.


In [1]:
import pandas as pd
%matplotlib inline
from ast import literal_eval
import numpy as np
import re
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]

In [2]:
# Load the text-matcher log (one row per compared file) into a DataFrame.
logPath = 'e0b/log.txt'
df = pd.read_csv(logPath)

In [3]:
def getDate(filename):
    """
    Extract the four-digit year embedded in a filename.

    The year must be delimited by underscores on both sides, as in
    ``e0b-txt/Hardy_1954_The Moment of Disenchantment.txt``.

    Parameters
    ----------
    filename : str
        File name or path to search.

    Returns
    -------
    int or None
        The first underscore-delimited four-digit number in ``filename``,
        or None when there is no such number or ``filename`` is not a string.
    """
    if not isinstance(filename, str):
        # Missing values (None, NaN) cannot be searched; treat them as undated.
        return None
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    m = re.search(r'_(\d{4})_', filename)
    return int(m.group(1)) if m is not None else None

# Year of each critical text, parsed from its filename.
df['Date'] = df['Text B'].apply(getDate)
# Round each year down to the start of its decade.
df['Decade'] = df['Date'].sub(df['Date'].mod(10))

In [4]:
# Preview the first five rows, including the new Date and Decade columns.
df.head(n=5)


Out[4]:
Text A Text B Threshold N-Grams Num Matches Text A Length Text B Length Locations in A Locations in B Date Decade
0 middlemarch.txt e0b-txt/WOLFE_2002_IRIS MURDOCH APPLIED TO GEO... 2 3 15 1793446 43119 [(539115, 539335), (539512, 539731), (1210547,... [(26240, 26460), (26575, 26793), (27942, 28243... 2002.0 2000.0
1 middlemarch.txt e0b-txt/Hardy_1954_The Moment of Disenchantmen... 2 3 4 1793446 20226 [(580715, 580913), (580948, 581003), (581021, ... [(9666, 9863), (9897, 9952), (10132, 10161), (... 1954.0 1950.0
2 middlemarch.txt e0b-txt/MORRIS_1990_THE DIALOGIC UNIVERSE OF M... 2 3 18 1793446 47530 [(1620, 1951), (40145, 40236), (308564, 308874... [(4259, 4591), (6252, 6343), (7558, 7868), (10... 1990.0 1990.0
3 middlemarch.txt e0b-txt/Guth_1999_George Eliot and Schiller.txt 2 3 3 1793446 46745 [(8798, 8828), (83611, 83653), (1786635, 17866... [(18049, 18079), (19244, 19285), (27895, 27934)] 1999.0 1990.0
4 middlemarch.txt e0b-txt/Payne_1999_The Serialist Vanishes.txt 2 3 3 1793446 60072 [(345200, 345470), (1608890, 1609386), (161169... [(4009, 4280), (25515, 26010), (26702, 26740)] 1999.0 1990.0

In [5]:
# Distribution of publication years across the matched texts.
df['Date'].hist(bins=10)


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc8bf53c390>

In [6]:
# Length of Text A, taken from the row labelled 0.
textALength = df.loc[0, 'Text A Length']
textALength


Out[6]:
1793446

In [7]:
# Decades included in the analysis: 1950, 1960, ..., 2010 (the stop value is exclusive).
decades = np.arange(start=1950, stop=2020, step=10)

In [8]:
# Group the quotation locations by decade.
# Keys are decades; each value is one flat list of every location in
# Text A recorded for that decade.
decadeDict = {}
for _, row in df.iterrows():
    # The log stores each location list as a string literal; parse it back.
    spans = literal_eval(row['Locations in A'])
    decadeDict.setdefault(row['Decade'], []).extend(spans)

In [9]:
# Keep only the first element of every location, i.e. where each quote begins.
decadeStarts = {
    decade: [span[0] for span in spans]
    for decade, spans in decadeDict.items()
}

In [10]:
# For every decade of interest, count quote starts in 50 equal-width bins
# spanning the range 0..textALength.
decadesBinned = {}
for decade, starts in decadeStarts.items():
    if decade not in decades:
        continue
    # np.histogram returns (counts, bin_edges); only the counts are kept.
    counts, _edges = np.histogram(starts, bins=50, range=(0, textALength))
    decadesBinned[decade] = counts

In [11]:
# Build a decade-by-segment table: one row per decade, one column per bin.
# sort_index() puts the rows in chronological order. Without it the row order
# follows the insertion order of decadesBinned's keys, i.e. the order in which
# decades happen to be met first in the log, which would scramble the heatmap.
decadesDF = pd.DataFrame(decadesBinned).T.sort_index()

In [12]:
# Show the binned counts: one row per decade, one column per novel segment (50 bins).
decadesDF


Out[12]:
0 1 2 3 4 5 6 7 8 9 ... 40 41 42 43 44 45 46 47 48 49
1950.0 6 4 0 2 0 2 0 0 0 0 ... 0 0 4 7 3 1 0 45 4 3
1960.0 9 2 1 4 2 4 0 0 12 4 ... 2 0 0 6 0 5 3 20 2 9
1970.0 41 14 2 19 5 4 3 0 9 18 ... 0 0 1 2 11 1 1 15 1 15
1980.0 31 18 7 5 3 6 6 9 33 12 ... 0 0 0 9 10 10 1 17 3 24
1990.0 36 12 10 14 2 15 2 10 37 9 ... 0 2 8 1 8 2 15 27 6 15
2000.0 46 16 10 1 18 9 0 7 6 9 ... 0 5 6 3 3 3 7 7 4 7
2010.0 45 5 4 1 11 0 2 1 9 5 ... 2 0 1 4 6 3 7 5 1 21

7 rows × 50 columns


In [13]:
# Normalize: divide every row by its largest bin count.
rowMax = decadesDF.max(axis=1)
decadesDF = decadesDF.divide(rowMax, axis='index')

In [14]:
# One label per decade row, plus a closing '2020' label for the final tick.
ylabels = list(map(lambda decade: str(int(decade)), decadesDF.index))
ylabels.append('2020')

In [54]:
# Heatmap of decadesDF: one row per decade, one column per novel segment.
fig, ax = plt.subplots()
heatmap = ax.pcolor(decadesDF, cmap='gnuplot')

# One more tick than there are rows, so each label sits on a row boundary.
rowEdges = np.arange(len(decadesDF.index) + 1)
ax.set_yticks(rowEdges)
ax.set_yticklabels(ylabels)
ax.invert_yaxis()  # draw the first row at the top

ax.set_ylabel('Decade')
ax.set_xlabel('Novel Segment')
ax.set_title("Frequency of Quotations from George Eliot's Middlemarch in Criticism, By Decade")

# Colorbar is drawn without tick marks.
fig.colorbar(heatmap, ax=ax, ticks=[])
plt.show()



In [16]:
# Sum each segment's column over all decades and draw the totals as a bar chart.
decadesDF.sum(axis=0).plot.bar()


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc8bed436d8>