Experiment 0-A

This experiment compared 344 texts scraped from JSTOR against the text of George Eliot's *Middlemarch*; 220 of them contained at least one matching passage.



In [1]:

    
import pandas as pd
%matplotlib inline
from ast import literal_eval
import numpy as np
import re
from matplotlib import pyplot as plt
# Use a wide 16x6 default figure size for every plot in this notebook.
plt.rc("figure", figsize=[16, 6])



In [2]:

    
# Load the match log into a DataFrame using pandas' default CSV parsing.
df = pd.read_csv(filepath_or_buffer='e0a/log.txt')



In [3]:

    
def getDate(filename):
    """
    Extract the four-digit year embedded in a filename.

    The year is the first group of exactly four digits enclosed by
    underscores, e.g. 'txt/Nunokawa_1993_The Miser.pdf.txt' -> 1993.

    Parameters
    ----------
    filename : str
        File name or path to search.

    Returns
    -------
    int or None
        The year as an integer, or None when the name contains no
        '_YYYY_' group or when `filename` is not a string (for example
        a missing value read by pandas).
    """
    # A non-string value (e.g. NaN from an empty CSV field) cannot carry a
    # date; report it as undated instead of letting re.search raise TypeError.
    if not isinstance(filename, str):
        return None
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    m = re.search(r'_(\d{4})_', filename)
    return int(m.group(1)) if m is not None else None

# Year parsed from each Text B filename, then that year rounded down to
# the start of its decade.
df['Date'] = df['Text B'].map(getDate)
df['Decade'] = df['Date'] // 10 * 10



In [4]:

    
# Preview the first five rows, including the derived Date and Decade columns.
df.head(n=5)









    Out[4]:






  
    
      
      Text A
      Text B
      Threshold
      N-Grams
      Num Matches
      Text A Length
      Text B Length
      Locations in A
      Locations in B
      Date
      Decade
    
  
  
    
      0
      middlemarch.txt
      txt/Nunokawa_1993_The Miser's Two Bodies.pdf.txt
      2
      3
      2
      1793446
      62029
      [(1662476, 1662946), (1712058, 1712098)]
      [(8779, 9249), (32760, 32797)]
      1993.0
      1990.0
    
    
      1
      middlemarch.txt
      txt/Staten_2000_Is Middlemarch Ahistorical.pdf...
      2
      3
      3
      1793446
      66758
      [(313509, 313801), (739656, 739671), (739694, ...
      [(40984, 41278), (42171, 42186), (42350, 42459)]
      2000.0
      2000.0
    
    
      2
      middlemarch.txt
      txt/Deery_1985_Margaret Fuller and Dorothea Br...
      2
      3
      6
      1793446
      21765
      [(47018, 47035), (68821, 68901), (95818, 95853...
      [(7697, 7718), (8515, 8596), (8705, 8740), (92...
      1985.0
      1980.0
    
    
      3
      middlemarch.txt
      txt/McGeer_2004_The Art of Good Hope.pdf.txt
      2
      3
      5
      1793446
      56683
      [(42704, 42735), (527924, 527947), (1326632, 1...
      [(20849, 20880), (24073, 24096), (26993, 27130...
      2004.0
      2000.0
    
    
      4
      middlemarch.txt
      txt/HAWES_1992_GEORGE ELIOTS SAYINGS.pdf.txt
      2
      3
      1
      1793446
      13767
      [(1327591, 1327734)]
      [(6577, 6720)]
      1992.0
      1990.0



In [5]:

    
# Histogram of the years parsed from the Text B filenames.
df.loc[:, 'Date'].hist()









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f1a91004da0>



In [6]:

    
# Length of Text A, read from the row labelled 0 of the log.
textALength = df.loc[0, 'Text A Length']
textALength









    Out[6]:





1793446



In [7]:

    
# Decade start years to analyse: 1930, 1940, ..., 2010 (the stop value is exclusive).
decades = np.arange(start=1930, stop=2020, step=10)



In [8]:

    
# Make a dictionary of decades.
# Keys are decades; values are the pooled list of match locations in Text A
# for every row of the log that falls in that decade.
decadeDict = {}
# Iterating over just the two columns we need avoids building a Series per
# row, which is what iterrows() does.
for decade, rawLocations in zip(df['Decade'], df['Locations in A']):
    # Rows whose filename yielded no date have a NaN decade. NaN is not equal
    # to itself, so it is unreliable as a dict key; skip those rows rather
    # than letting them add stray entries.
    if pd.isna(decade):
        continue
    # The log stores locations as the string form of a list of tuples;
    # literal_eval turns it back into Python objects.
    decadeDict.setdefault(decade, []).extend(literal_eval(rawLocations))



In [9]:

    
# Keep only the first element of every location tuple, i.e. where each
# quote begins.
decadeStarts = dict(
    (decade, [span[0] for span in spans])
    for decade, spans in decadeDict.items()
)



In [22]:

    
# For each decade listed in `decades`, count quote starts in 50 equal-width
# segments spanning 0..textALength. np.histogram returns (counts, edges);
# only the counts are kept.
decadesBinned = {
    decade: np.histogram(starts, bins=50, range=(0, textALength))[0]
    for decade, starts in decadeStarts.items()
    if decade in decades
}



In [23]:

    
# One row per decade, one column per novel segment.
# Recent pandas keeps dict insertion order, which here follows the order in
# which decades first appear in the log, not chronological order. Sort the
# index explicitly so the rows (and the heatmap built from them) always run
# from the earliest decade to the latest.
decadesDF = pd.DataFrame(decadesBinned).T.sort_index()



In [24]:

    
# Display the decade-by-segment count table built above.
decadesDF









    Out[24]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
    
  
  
    
      1950.0
      5
      3
      0
      2
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      1
      4
      0
      1
      0
      44
      4
      2
    
    
      1960.0
      7
      0
      1
      1
      0
      1
      0
      0
      8
      2
      ...
      4
      0
      0
      6
      0
      4
      1
      33
      2
      4
    
    
      1970.0
      39
      14
      2
      13
      6
      1
      2
      0
      5
      13
      ...
      0
      0
      1
      2
      7
      1
      1
      12
      1
      17
    
    
      1980.0
      30
      18
      7
      5
      3
      6
      6
      9
      22
      12
      ...
      0
      0
      0
      8
      3
      10
      1
      18
      3
      12
    
    
      1990.0
      35
      11
      9
      14
      2
      15
      2
      10
      37
      9
      ...
      0
      2
      3
      1
      8
      2
      15
      27
      6
      15
    
    
      2000.0
      40
      12
      10
      0
      18
      9
      0
      7
      6
      8
      ...
      0
      5
      6
      3
      3
      3
      7
      7
      4
      7
    
    
      2010.0
      45
      5
      4
      1
      11
      0
      2
      1
      7
      5
      ...
      2
      0
      1
      4
      6
      3
      7
      5
      1
      20
    
  

7 rows × 50 columns



In [25]:

    
# Tick labels for the heatmap's y axis: one per decade row, written as an
# integer year, plus a trailing '2020' for the edge that closes the last row.
ylabels = list(map(str, map(int, decadesDF.index)))
ylabels.append('2020')



In [26]:

    
# Heatmap of quote counts: rows are decades, columns are novel segments.
plt.pcolor(decadesDF)
# Ticks sit on row edges, so there is one more tick than there are rows.
plt.yticks(np.arange(1 + len(decadesDF.index)), ylabels)
# Flip the y axis so the first row of the table is drawn at the top.
plt.gca().invert_yaxis()
plt.gca().set(
    ylabel='Decade',
    xlabel='Quotes Per Novel Segment',
    title="Number of Critical Quotations from George Eliot's Middlemarch, By Decade",
)
plt.show()



In [21]:

    
# Total quote count per novel segment, summed over all decade rows.
decadesDF.sum(axis=0).plot.bar()









    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f1a6baf9630>



In [ ]:

	Text A	Text B	Threshold	N-Grams	Num Matches	Text A Length	Text B Length	Locations in A	Locations in B	Date	Decade
0	middlemarch.txt	txt/Nunokawa_1993_The Miser's Two Bodies.pdf.txt	2	3	2	1793446	62029	[(1662476, 1662946), (1712058, 1712098)]	[(8779, 9249), (32760, 32797)]	1993.0	1990.0
1	middlemarch.txt	txt/Staten_2000_Is Middlemarch Ahistorical.pdf...	2	3	3	1793446	66758	[(313509, 313801), (739656, 739671), (739694, ...	[(40984, 41278), (42171, 42186), (42350, 42459)]	2000.0	2000.0
2	middlemarch.txt	txt/Deery_1985_Margaret Fuller and Dorothea Br...	2	3	6	1793446	21765	[(47018, 47035), (68821, 68901), (95818, 95853...	[(7697, 7718), (8515, 8596), (8705, 8740), (92...	1985.0	1980.0
3	middlemarch.txt	txt/McGeer_2004_The Art of Good Hope.pdf.txt	2	3	5	1793446	56683	[(42704, 42735), (527924, 527947), (1326632, 1...	[(20849, 20880), (24073, 24096), (26993, 27130...	2004.0	2000.0
4	middlemarch.txt	txt/HAWES_1992_GEORGE ELIOTS SAYINGS.pdf.txt	2	3	1	1793446	13767	[(1327591, 1327734)]	[(6577, 6720)]	1992.0	1990.0

	0	1	2	3	4	5	6	7	8	9	...	40	41	42	43	44	45	46	47	48	49
1950.0	5	3	0	2	0	0	0	0	0	0	...	0	0	1	4	0	1	0	44	4	2
1960.0	7	0	1	1	0	1	0	0	8	2	...	4	0	0	6	0	4	1	33	2	4
1970.0	39	14	2	13	6	1	2	0	5	13	...	0	0	1	2	7	1	1	12	1	17
1980.0	30	18	7	5	3	6	6	9	22	12	...	0	0	0	8	3	10	1	18	3	12
1990.0	35	11	9	14	2	15	2	10	37	9	...	0	2	3	1	8	2	15	27	6	15
2000.0	40	12	10	0	18	9	0	7	6	8	...	0	5	6	3	3	3	7	7	4	7
2010.0	45	5	4	1	11	0	2	1	7	5	...	2	0	1	4	6	3	7	5	1	20