Experiment 0-D

This experiment used 483 texts scraped from JSTOR, 280 of which contain text matches. In an effort to make the corpus more diachronic, some of these texts were scraped from decade-based searches, e.g. a search for the keyword "Middlemarch" in articles from 1930 to 1939. This experiment was conducted with text-matcher 0.1.4, which fixes the issue with matches cutting off too early. It also uses a threshold of 6 words, in an attempt to avoid false positives.



In [1]:

    
import pandas as pd
%matplotlib inline
from ast import literal_eval
import numpy as np
import re
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]



In [2]:

    
# Read in the experiment results from the text-matcher log file.
# The log is parsed as CSV; judging by the df.head() output, each row
# describes one Text A / Text B comparison, with the match locations stored
# as stringified lists of (start, end) tuples.
df = pd.read_csv('e0d/log.txt')



In [3]:

    
def getDate(filename): 
    """
    Extract a four-digit year from a filename.

    The year is expected to be delimited by underscores on both sides,
    as in 'Hardy_1954_The Moment of Disenchantment.txt'.

    Parameters
    ----------
    filename : str
        File name or path to search.

    Returns
    -------
    int or None
        The first underscore-delimited four-digit number found, or None
        when the filename contains no such pattern.
    """
    # Raw string: '\d' in a plain string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    m = re.search(r'_(\d{4})_', filename)
    if m is None:
        return None
    return int(m.group(1))

# Parse the publication year out of each Text B filename, then floor it to
# the first year of its decade (e.g. 1954 -> 1950).
df['Date'] = df['Text B'].apply(getDate)
df['Decade'] = (df['Date'] // 10) * 10



In [4]:

    
# Preview the first five rows, including the new Date and Decade columns.
df.head(n=5)









    Out[4]:






  
    
      
      Text A
      Text B
      Threshold
      N-Grams
      Num Matches
      Text A Length
      Text B Length
      Locations in A
      Locations in B
      Date
      Decade
    
  
  
    
      0
      middlemarch.txt
      e0b/e0b-txt/WOLFE_2002_IRIS MURDOCH APPLIED TO...
      6
      3
      17
      1793446
      43119
      [(539109, 539353), (539391, 539432), (539504, ...
      [(26234, 26478), (26481, 26521), (26567, 26802...
      2002.0
      2000.0
    
    
      1
      middlemarch.txt
      e0b/e0b-txt/Hardy_1954_The Moment of Disenchan...
      6
      3
      3
      1793446
      20226
      [(580711, 580936), (580948, 581020), (1691325,...
      [(9662, 9886), (9897, 9969), (19879, 20002)]
      1954.0
      1950.0
    
    
      2
      middlemarch.txt
      e0b/e0b-txt/MORRIS_1990_THE DIALOGIC UNIVERSE ...
      6
      3
      21
      1793446
      47530
      [(1615, 1964), (29948, 30025), (40132, 40256),...
      [(4254, 4605), (5612, 5690), (6239, 6364), (75...
      1990.0
      1990.0
    
    
      3
      middlemarch.txt
      e0b/e0b-txt/Guth_1999_George Eliot and Schille...
      6
      3
      5
      1793446
      46745
      [(8798, 8850), (8870, 8930), (83611, 83687), (...
      [(18049, 18101), (18127, 18188), (19244, 19319...
      1999.0
      1990.0
    
    
      4
      middlemarch.txt
      e0b/e0b-txt/Payne_1999_The Serialist Vanishes.txt
      6
      3
      4
      1793446
      60072
      [(345200, 345488), (1608886, 1609417), (161169...
      [(4009, 4298), (25511, 26041), (26698, 26767),...
      1999.0
      1990.0



In [5]:

    
# Distribution of publication years across the matched articles.
df['Date'].hist(bins=10)









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fb92813dda0>



In [6]:

    
# Length of Text A, read from the row labelled 0.
# NOTE(review): this assumes every row reports the same Text A length (the
# preview rows all do) -- confirm if the log ever mixes source texts.
textALength = df.loc[0, 'Text A Length']
textALength









    Out[6]:





1793446



In [7]:

    
# Decades included in the analysis: 1950, 1960, ..., 2010 (stop is exclusive).
decades = np.arange(start=1950, stop=2020, step=10)



In [8]:

    
# Make a dictionary of decades.
# Keys are the decade values from df['Decade']; each value is one flat list
# of the location tuples parsed from 'Locations in A' for every row in that
# decade.
decadeDict = {}
for i, row in df.iterrows():
    decade = row['Decade']
    # Rows whose filename had no parseable year have a missing (NaN) decade.
    # NaN never compares equal to itself, so each such row would otherwise
    # create its own useless dictionary key; skip them instead.
    if pd.isnull(decade):
        continue
    # The column stores the list as its string repr; literal_eval parses it
    # without executing arbitrary code.
    locations = literal_eval(row['Locations in A'])
    decadeDict.setdefault(decade, []).extend(locations)



In [9]:

    
# Keep only the first element of each location tuple, i.e. where each quote
# begins, still grouped by decade.
decadeStarts = dict(
    (decade, [span[0] for span in spans])
    for decade, spans in decadeDict.items()
)



In [10]:

    
# For every decade listed in `decades`, count how many quote starts fall in
# each of 50 equal-width bins spanning 0..textALength. np.histogram returns
# (counts, bin_edges); only the counts are kept.
decadesBinned = dict(
    (decade, np.histogram(starts, bins=50, range=(0, textALength))[0])
    for decade, starts in decadeStarts.items()
    if decade in decades
)



In [11]:

    
# One row per decade, one column per histogram bin.
# Building a DataFrame from a dict keeps the dict's insertion order on
# current pandas, which here is the order in which decades first appear in
# the log. Sort the index so the rows are always chronological.
decadesDF = pd.DataFrame(decadesBinned).T.sort_index()



In [12]:

    
# Show the raw (not yet normalized) per-decade bin counts.
decadesDF









    Out[12]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
    
  
  
    
      1950.0
      5
      5
      0
      1
      0
      2
      0
      0
      0
      0
      ...
      0
      0
      4
      6
      3
      1
      0
      44
      5
      4
    
    
      1960.0
      6
      3
      1
      5
      2
      4
      0
      0
      12
      4
      ...
      2
      0
      0
      5
      0
      8
      3
      17
      3
      11
    
    
      1970.0
      40
      19
      3
      22
      6
      5
      3
      0
      10
      15
      ...
      0
      2
      0
      2
      9
      1
      1
      18
      2
      17
    
    
      1980.0
      31
      13
      5
      6
      3
      6
      6
      7
      38
      7
      ...
      0
      0
      1
      11
      10
      13
      1
      15
      0
      26
    
    
      1990.0
      35
      13
      12
      4
      3
      12
      2
      12
      51
      16
      ...
      0
      2
      8
      1
      9
      2
      15
      32
      8
      19
    
    
      2000.0
      45
      10
      10
      1
      19
      10
      2
      6
      11
      6
      ...
      0
      5
      7
      2
      4
      3
      6
      7
      6
      8
    
    
      2010.0
      47
      3
      5
      2
      10
      0
      2
      1
      9
      5
      ...
      2
      0
      1
      4
      7
      3
      7
      6
      1
      21
    
  

7 rows × 50 columns



In [13]:

    
# Normalize: divide every row (decade) by its own largest bin count, so each
# decade's values are expressed relative to its most-quoted bin.
decadesDF = decadesDF.divide(decadesDF.max(axis='columns'), axis='index')



In [14]:

    
# One label per row of decadesDF, plus a trailing '2020' so there are
# len(index) + 1 labels.
ylabels = list(map(str, map(int, decadesDF.index))) + ['2020']



In [15]:

    
# Heatmap of the normalized table: one row per decade, one column per bin.
heatmapAxes = plt.gca()
heatmapAxes.pcolor(decadesDF)
# N rows get N + 1 tick positions, one for each label in ylabels.
heatmapAxes.set_yticks(np.arange(len(decadesDF.index) + 1))
heatmapAxes.set_yticklabels(ylabels)
# Flip the axis so the first row is drawn at the top.
heatmapAxes.invert_yaxis()
heatmapAxes.set_ylabel('Decade')
heatmapAxes.set_xlabel('Quotes Per Novel Segment')
heatmapAxes.set_title("Number of Critical Quotations from George Eliot's Middlemarch, By Decade")
plt.show()



In [16]:

    
# Add up the per-decade values in each bin (column-wise sum) and draw the
# totals as a bar chart.
decadesDF.sum(axis=0).plot.bar()









    Out[16]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fb922c83358>



In [ ]:

	Text A	Text B	Threshold	N-Grams	Num Matches	Text A Length	Text B Length	Locations in A	Locations in B	Date	Decade
0	middlemarch.txt	e0b/e0b-txt/WOLFE_2002_IRIS MURDOCH APPLIED TO...	6	3	17	1793446	43119	[(539109, 539353), (539391, 539432), (539504, ...	[(26234, 26478), (26481, 26521), (26567, 26802...	2002.0	2000.0
1	middlemarch.txt	e0b/e0b-txt/Hardy_1954_The Moment of Disenchan...	6	3	3	1793446	20226	[(580711, 580936), (580948, 581020), (1691325,...	[(9662, 9886), (9897, 9969), (19879, 20002)]	1954.0	1950.0
2	middlemarch.txt	e0b/e0b-txt/MORRIS_1990_THE DIALOGIC UNIVERSE ...	6	3	21	1793446	47530	[(1615, 1964), (29948, 30025), (40132, 40256),...	[(4254, 4605), (5612, 5690), (6239, 6364), (75...	1990.0	1990.0
3	middlemarch.txt	e0b/e0b-txt/Guth_1999_George Eliot and Schille...	6	3	5	1793446	46745	[(8798, 8850), (8870, 8930), (83611, 83687), (...	[(18049, 18101), (18127, 18188), (19244, 19319...	1999.0	1990.0
4	middlemarch.txt	e0b/e0b-txt/Payne_1999_The Serialist Vanishes.txt	6	3	4	1793446	60072	[(345200, 345488), (1608886, 1609417), (161169...	[(4009, 4298), (25511, 26041), (26698, 26767),...	1999.0	1990.0

	0	1	2	3	4	5	6	7	8	9	...	40	41	42	43	44	45	46	47	48	49
1950.0	5	5	0	1	0	2	0	0	0	0	...	0	0	4	6	3	1	0	44	5	4
1960.0	6	3	1	5	2	4	0	0	12	4	...	2	0	0	5	0	8	3	17	3	11
1970.0	40	19	3	22	6	5	3	0	10	15	...	0	2	0	2	9	1	1	18	2	17
1980.0	31	13	5	6	3	6	6	7	38	7	...	0	0	1	11	10	13	1	15	0	26
1990.0	35	13	12	4	3	12	2	12	51	16	...	0	2	8	1	9	2	15	32	8	19
2000.0	45	10	10	1	19	10	2	6	11	6	...	0	5	7	2	4	3	6	7	6	8
2010.0	47	3	5	2	10	0	2	1	9	5	...	2	0	1	4	7	3	7	6	1	21