In [3]:
import pandas as pd
import numpy as np
#import spacy
import re
import json
import altair as alt
from nltk.corpus import names  # Needed below for the author-gender guesses
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [16, 6]
plt.style.use('ggplot')
In [ ]:
with open('../middlemarch.txt') as f:
    mm = f.read()
In [ ]:
textALength = len(mm)
In [ ]:
# Get chapter locations
chapterMatches = re.finditer('PRELUDE|CHAPTER|FINALE', mm)
chapterLocations = [match.start() for match in chapterMatches]
chapterLocations.append(textALength) # Append the text's end to bound the last chunk.
len(chapterLocations)
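Since all-caps "CHAPTER" could in principle occur outside headings, a quick sanity check (a sketch) prints a little context around the first few matches:
In [ ]:
from itertools import islice
# Eyeball the first few heading matches to confirm they are real headings.
for match in islice(re.finditer('PRELUDE|CHAPTER|FINALE', mm), 5):
    print(repr(mm[match.start():match.start() + 40]))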
In [ ]:
# Get book locations
bookLocations = [match.start() for match in re.finditer('\nBOOK', mm)]
bookLocations = [0] + bookLocations + [textALength] # Add the text's start and end to bound the first and last chunks.
bookLocations
In [4]:
def getChapters(text):
    """ Split the text into chunks at the chapter locations found above. """
    chapters = []
    for i, loc in enumerate(chapterLocations):
        if i != len(chapterLocations)-1:
            chapter = text[loc:chapterLocations[i+1]]  # Slice the argument, not the global mm.
            chapters.append(chapter)
    return chapters
In [5]:
chapters = getChapters(mm)
chapterLengths = [len(chapter.split()) for chapter in chapters]
chapterLengthsSeries = pd.Series(chapterLengths)
chapterLengthsSeries.plot(kind='bar', title='Chapter Lengths')
In [6]:
df = pd.read_json('../data/e3.json')
In [7]:
df['Decade'] = df['year'] - (df['year'] % 10) # Floor each year to its decade, e.g. 1987 -> 1980.
# df['Locations in A'] = df['matches'].apply(lambda x: x[1])
# df['NumMatches'] = df['matches'].apply(lambda x: x[0])
In [8]:
sum([len(item) for item in df['Locations in A'].values]) # Total number of matched quotations across all articles
Out[8]:
In [9]:
len(df) # Total articles with "Middlemarch" mentioned somewhere
Out[9]:
Find only those with non-trivial quotations from Middlemarch:
In [10]:
articlesWithMatches = df[df['Locations in A'].apply(lambda x: len(x) > 0)]
articlesWithMatches.year.describe()
Out[10]:
In [11]:
articlesWithMatches.Wordcounts.apply(len).head()
Out[11]:
In [12]:
# articlesWithMatches.to_json('../data/cleaned-matches.json')
In [13]:
alt.Chart(articlesWithMatches).mark_bar().encode(x='year:O', y='count()').properties(width=1000)
In [14]:
df.columns
Out[14]:
In [15]:
df[df['Quoted Words'] > 0]['disc_name'].value_counts().head()
Out[15]:
In [16]:
def isGarbage(itemTitle):
    """ Flag paratexts (front/back matter, contents, covers) that aren't real articles. """
    badTitles = ['front matter', 'back matter', 'table of contents', 'cover']
    if itemTitle is None:  # No title to check, so don't flag it.
        return False
    for title in itemTitle:
        for badTitle in badTitles:
            if badTitle in title.lower():
                return True
    return False
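A quick spot check of the filter on illustrative, made-up title lists:
In [ ]:
print(isGarbage(['Front Matter']))       # True: matches 'front matter'
print(isGarbage(['Middlemarch at 150'])) # False: looks like a real article
print(isGarbage(None))                   # False: nothing to check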
In [17]:
len(df[df.title.apply(isGarbage)]) # How many garbage items?
Out[17]:
In [18]:
df['Quoted Words'].describe()
Out[18]:
In [19]:
articlesWithMatches['Quoted Words'].describe()
Out[19]:
In [20]:
len(df[df['Quoted Words'] > 0])
Out[20]:
In [21]:
articlesWithMatches['Quoted Words'].hist()
In [22]:
articlesWithMatches['Wordcounts'].apply(np.mean).head()
Out[22]:
In [23]:
articlesWithMatches['Wordcounts'].apply(np.mean).describe()
Out[23]:
In [24]:
def diachronicAnalysis(df, decades=(1950, 2020), bins=chapterLocations, useWordcounts=True, normalize=True):
    """ With useWordcounts on, each quotation is weighted by its word count;
    with it off, raw numbers of quotations are used. """
    decades = np.arange(decades[0], decades[1], 10)
    # Make a dictionary of decades.
    # Values are a list of locations.
    decadeDict = {}
    for i, row in df.iterrows():
        decade = row['Decade']
        locationsAndWordcounts = row['Locations in A with Wordcounts']
        if decade not in decadeDict:
            decadeDict[decade] = locationsAndWordcounts.copy()
        else:
            decadeDict[decade] += locationsAndWordcounts.copy()
    # Grab the beginnings of quotes, paired with their word counts.
    decadeStartsWeights = {decade: [(item[0][0], item[1])
                                    for item in loc]
                           for decade, loc in decadeDict.items()}
    if useWordcounts:
        decadesBinned = {decade:
                         np.histogram([loc[0] for loc in locations],
                                      bins=bins,
                                      weights=[loc[1] for loc in locations],
                                      range=(0, textALength))[0]
                         for decade, locations in decadeStartsWeights.items()
                         if decade in decades}
    else:
        decadesBinned = {decade:
                         np.histogram([loc[0] for loc in locations],
                                      bins=bins,
                                      range=(0, textALength))[0]
                         for decade, locations in decadeStartsWeights.items()
                         if decade in decades}
    decadesDF = pd.DataFrame(decadesBinned).T
    # Normalize each decade's row by its maximum, so rows are comparable across decades.
    if normalize:
        decadesDF = decadesDF.div(decadesDF.max(axis=1), axis=0)
    return decadesDF
def countWords(locRange):
    """ Counts words in Middlemarch, given a character range. """
    chunk = mm[locRange[0]:locRange[1]]
    return len(chunk.split())

def totalWords(locRangeSet):
    """ Counts total words in a list of location ranges. """
    return sum([countWords(locRange) for locRange in locRangeSet])

def countsPerSet(locRangeSet):
    """ Returns an augmented location range set that includes word counts. """
    return [(locRange, countWords(locRange))
            for locRange in locRangeSet]

def extractWordcounts(locsAndWordcounts):
    """
    Takes pairs of location ranges and wordcounts,
    and returns just the wordcounts.
    """
    return [item[1] for item in locsAndWordcounts]
def synchronicAnalysis(df, bins=chapterLocations, useWordcounts=True):
    locs = df['Locations in A'].values
    locCounts = [(loc, countWords(loc)) for locSet in locs
                 for loc in locSet]
    starts = [loc[0][0] for loc in locCounts]
    counts = [loc[1] for loc in locCounts]
    if useWordcounts:
        binned = np.histogram(starts, bins=bins,
                              weights=counts, range=(0, textALength))
    else:
        binned = np.histogram(starts, bins=bins,
                              range=(0, textALength))
    binnedDF = pd.Series(binned[0])
    return binnedDF
def plotDiachronicAnalysis(df, save=False, reverse=False):
    ylabels = [str(int(decade)) for decade in df.index] + ['2020']
    plt.pcolor(df, cmap='gnuplot')
    plt.yticks(np.arange(len(df.index)+1), ylabels)
    plt.gca().invert_yaxis()
    plt.ylabel('Decade')
    plt.xlabel('Chapter')
    plt.gca().set_xlim((0, len(df.T)))
    plt.colorbar(ticks=[])
    if save:
        plt.savefig('diachronic.png', bbox_inches='tight', dpi=300, transparent=True)
    plt.show()
def plotSynchronicAnalysis(s, useWordcounts=True):
    ax = s.plot(kind='bar')
    ax.set_xlabel('Chapter')
    if useWordcounts:
        ax.set_ylabel('Number of Words Quoted')
    else:
        ax.set_ylabel('Number of Quotations')
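Both analyses lean on np.histogram's weights argument: with weights, each quotation start contributes its word count to its bin instead of 1. (When bins is an explicit list of edges, as with chapterLocations, the range argument has no effect.) A toy illustration:
In [ ]:
# Two quote starts land in the first bin (edges 0-10), one in the second (10-20).
toyStarts = [2, 5, 12]
toyCounts = [100, 50, 25]
print(np.histogram(toyStarts, bins=[0, 10, 20])[0])                    # [2 1] -- raw quotation counts
print(np.histogram(toyStarts, bins=[0, 10, 20], weights=toyCounts)[0]) # [150. 25.] -- word-count weighted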
In [25]:
df['Quoted Words'] = df['Locations in A'].apply(totalWords)
In [24]:
df['Locations in A with Wordcounts'] = df['Locations in A'].apply(countsPerSet)
In [25]:
# Verify that the diachronic wordcounts are the same as the synchronic wordcounts
decadeSums = diachronicAnalysis(df, decades=(1700, 2020), useWordcounts=True, normalize=False).sum(axis=1)
decadeSums.sum()
Out[25]:
In [26]:
chapterSums = synchronicAnalysis(df)
chapterSums.sum()
Out[26]:
In [27]:
df['Wordcounts'] = df['Locations in A with Wordcounts'].apply(extractWordcounts)
In [28]:
wordcounts = []
for countSet in df['Wordcounts'].values:
    for count in countSet:
        wordcounts.append(count)
In [29]:
pd.Series(wordcounts).hist()
Out[29]:
In [30]:
plotSynchronicAnalysis(synchronicAnalysis(df))
In [122]:
synchronicAnalysis(df, useWordcounts=True).to_csv('../papers/spring2017-middlemarch-paper/data/num-words-quoted-per-chapter.csv')
In [126]:
allMatches = []
for group in df['Locations in A'].values:
    for pair in group:
        allMatches.append(pair)
In [127]:
len(allMatches)
Out[127]:
In [21]:
plotSynchronicAnalysis(synchronicAnalysis(df, useWordcounts=False), useWordcounts=False)
In [82]:
quotationsPerBook = synchronicAnalysis(df, bins=bookLocations, useWordcounts=False)
quotationsPerBook
Out[82]:
In [100]:
quotationsPerBook = pd.DataFrame(quotationsPerBook, index=range(0,9), columns=['# Quotations'])
quotationsPerBook['Book'] = range(0, 9) # 9 bins: the chunk before Book I (title page, Prelude) plus Books I-VIII.
quotationsPerBook
Out[100]:
In [102]:
alt.Chart(quotationsPerBook).mark_bar().encode(x='Book:O', y='# Quotations:Q').properties(width=500)
Out[102]:
In [114]:
# Get the raw number of quotations per chapter
# synchronicAnalysis(df, useWordcounts=False).to_csv('../papers/spring2017-middlemarch-paper/data/num-quotations-per-chapter.csv')
In [113]:
# Adjusted for the number of words in each chapter
ax = (synchronicAnalysis(df) / chapterLengthsSeries).plot(kind='bar')
ax.set_xlabel('Chapter')
ax.set_ylabel('Words Quoted, Normalized')
Out[113]:
In [107]:
plotDiachronicAnalysis(diachronicAnalysis(df, decades=(1950, 2020), bins=chapterLocations).sort_index())
In [115]:
plotDiachronicAnalysis(diachronicAnalysis(df, decades=(1960, 2020), bins=chapterLocations).sort_index())
In [148]:
diaDF = diachronicAnalysis(df, decades=(1960, 2020), bins=chapterLocations).sort_index()
In [149]:
diaDF.columns.name = 'chapter'
diaDF.index.name = 'decade'
In [156]:
diaDF
Out[156]:
In [163]:
diaDF.columns
Out[163]:
In [190]:
diaDF['decade'] = diaDF.index
In [193]:
diaMelted = diaDF.melt(id_vars='decade')
In [200]:
alt.Chart(diaMelted).mark_rect().encode(x='chapter:O', y='decade:O', color='value').properties(width=1000, height=300)
Out[200]:
In [202]:
booksDiaDF = diachronicAnalysis(df, decades=(1960, 2020), bins=bookLocations).sort_index()
booksDiaDF
Out[202]:
In [204]:
booksDiaDF['decade'] = booksDiaDF.index
In [211]:
booksMelted = booksDiaDF.melt(id_vars='decade', var_name='book')
In [212]:
booksMelted.head()
Out[212]:
In [218]:
alt.Chart(booksMelted).mark_rect().encode(x='book:O', y='decade:O', color='value').properties(width=500, height=300)
Out[218]:
In [112]:
def plotDiachronicAnalysisBooks(df, save=False, reverse=False):
    """ Same as plotDiachronicAnalysis, but labeled for book-level bins. """
    ylabels = [str(int(decade)) for decade in df.index] + ['2020']
    plt.pcolor(df, cmap='gnuplot')
    plt.yticks(np.arange(len(df.index)+1), ylabels)
    plt.gca().invert_yaxis()
    plt.ylabel('Decade')
    plt.xlabel('Book')
    plt.gca().set_xlim((0, len(df.T)))
    plt.colorbar(ticks=[])
    if save:
        plt.savefig('diachronic.png', bbox_inches='tight', dpi=300, transparent=True)
    plt.show()

plotDiachronicAnalysisBooks(diachronicAnalysis(df, decades=(1950, 2020), bins=bookLocations).sort_index())
In [44]:
# Export image for publication
# plotDiachronicAnalysis(diachronicAnalysis(df, decades=(1950, 2020), bins=chapterLocations), save=True)
In [45]:
# Get the normalized proportion of, say, Chapter 20 in 1950:
diachronicAnalysis(df)[20][1950]
Out[45]:
In [46]:
maleNames, femaleNames = names.words('male.txt'), names.words('female.txt')
maleNames = [name.lower() for name in maleNames]
femaleNames = [name.lower() for name in femaleNames]
In [47]:
def guessGender(name):
    name = name.split()[0].lower() # Grab the first name.
    if name in maleNames and name in femaleNames:
        return 'A' # Ambiguous.
    elif name in maleNames:
        return 'M'
    elif name in femaleNames:
        return 'F'
    else:
        return 'U' # Unknown.

def averageGender(authorNames):
    if not isinstance(authorNames, list):
        return 'U'
    genderGuesses = [guessGender(name) for name in authorNames]
    stats = Counter(genderGuesses).most_common()
    if len(stats) == 1:
        # Only one distinct guess. Just use it.
        return stats[0][0]
    elif stats[0][1] == stats[1][1]: # There's a tie.
        return 'A' # Ambiguous.
    else:
        return stats[0][0] # Return the most common gender.
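A few spot checks on hypothetical author strings (results depend on the NLTK names corpus):
In [ ]:
print(guessGender('John Doe'))                   # 'M', assuming 'john' appears only in male.txt
print(guessGender('Mary Smith'))                 # 'F', assuming 'mary' appears only in female.txt
print(averageGender(['Mary Smith', 'John Doe'])) # 'A': one F and one M guess tie
print(averageGender('not a list'))               # 'U': non-list author fields are unknown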
In [48]:
df['gender'] = df['author'].apply(averageGender)
dfF = df.loc[df['gender'] == 'F']
dfM = df.loc[df['gender'] == 'M']
In [49]:
# Differences in citations between genders.
plotSynchronicAnalysis(synchronicAnalysis(dfM) - synchronicAnalysis(dfF))
In [35]:
def getFirst(row):
    if isinstance(row, list):
        return row[0]
    else:
        return row

topPublishers = df['publisher_name'].apply(getFirst).value_counts()
In [36]:
publishers = topPublishers[:80].index
In [37]:
publishers = publishers.tolist()
In [38]:
def getCountry(publisher):
    brits = ['Oxford University Press', 'Cambridge University Press', 'Modern Humanities Research Association',
             'BMJ', 'Taylor & Francis, Ltd.', 'Edinburgh University Press',
             'Royal Society for the Encouragement of Arts, Manufactures and Commerce']
    canadians = ['Victorian Studies Association of Western Canada']
    if not isinstance(publisher, list):
        return 'Unknown'
    publisher = publisher[0]
    if publisher in brits:
        return 'Britain'
    elif publisher in canadians or 'Canada' in publisher:
        return 'Canada'
    elif 'GmbH' in publisher:
        return 'Germany'
    elif 'estudios' in publisher:
        return 'Spain'
    elif 'France' in publisher:
        return 'France'
    elif 'Ireland' in publisher:
        return 'Ireland'
    else:
        return 'US' # Everything unrecognized is assumed to be American.
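Since everything unrecognized falls through to 'US', a couple of spot checks on illustrative publisher lists:
In [ ]:
print(getCountry(['Oxford University Press'])) # Britain
print(getCountry(['An Unlisted Press']))       # US -- the fallback
print(getCountry(None))                        # Unknown -- not a list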
In [39]:
df['country'] = df['publisher_name'].apply(getCountry)
In [40]:
df['country'].value_counts()
Out[40]:
In [41]:
dfBrits = df.loc[df['country'] == 'Britain']
dfYanks = df.loc[df['country'] == 'US']
dfCanadians = df.loc[df['country'] == 'Canada']
In [42]:
# Since British publishers are greatly outnumbered in this corpus, normalize each series by its own maximum.
britsHist = synchronicAnalysis(dfBrits)
normBrits = britsHist.div(britsHist.max())
yanksHist = synchronicAnalysis(dfYanks)
normYanks = yanksHist.div(yanksHist.max())
In [43]:
plotSynchronicAnalysis(normYanks - normBrits)
In [44]:
# Look at the top journals.
journalStats = df['journal'].value_counts()
journalStats[:10]
Out[44]:
In [45]:
journalList = journalStats.index
Compare the specialist journal, "George Eliot - George Henry Lewes Studies," with all other journals.
In [46]:
geJournals = df.loc[df['journal'] == 'George Eliot - George Henry Lewes Studies']
otherJournals = df.loc[df['journal'] != 'George Eliot - George Henry Lewes Studies']
In [47]:
# Normalize
geDF = synchronicAnalysis(geJournals)
otherDF = synchronicAnalysis(otherJournals)
normGE = geDF.div(geDF.max())
normOther = otherDF.div(otherDF.max())
In [64]:
ax = (normGE - normOther).plot(kind='bar')
ax.set_xlabel('Chapter')
ax.set_ylabel('Specialization Index')
# Save a big version for publication.
ax.get_figure().savefig('specialization.png', bbox_inches='tight', dpi=300)
In [56]:
journals = pd.DataFrame({title: synchronicAnalysis(df.loc[df['journal'] == title]) for title in journalList }).T
In [57]:
cutoff = 1500
topJournals = journals.loc[journals.sum(axis=1) > cutoff]
otherJournals = journals.loc[journals.sum(axis=1) <= cutoff] # <= so journals exactly at the cutoff aren't dropped.
topJournals.loc['Other'] = otherJournals.sum()
In [58]:
topJournals.T.plot(kind='bar', stacked=True, colormap='nipy_spectral')
Out[58]:
In [63]:
ax = topJournals.T.plot(kind='bar', stacked=True, colormap='nipy_spectral')
fig = ax.get_figure()
fig.savefig('synchronic-journals.png', bbox_inches='tight', dpi=300)
In [49]:
# Try to find out why Ch. 15 is cited so heavily in the 80s and 90s.
chap15s = []
ids = []
for i, row in df.iterrows():
    locations = row['Locations in A']
    starts = [item[0] for item in locations]
    if row['Decade'] in [1980, 1990]:
        for start in starts:
            if 290371 < start < 322051: # Chapter XV's character offsets. Does it cite Chapter XV?
                if row.id not in ids:
                    chap15s.append(row)
                    ids.append(row.id)
In [50]:
# Get the titles of those articles.
[item.title for item in chap15s]
Out[50]:
In [51]:
ch15Topics = [item.topics for item in chap15s]
chap15TopicsFlat = [item for sublist in ch15Topics for item in sublist]
Counter(chap15TopicsFlat).most_common(20)
Out[51]:
In [ ]:
Chapter 20 Detour
In [26]:
# Try to find out what articles cited chapter 20
chap20s = []
ids = []
for i, row in df.iterrows():
    locations = row['Locations in A']
    starts = [item[0] for item in locations]
    if row['Decade'] in [1870, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]:
        for start in starts:
            if 406324 < start < 432778: # Chapter XX's character offsets. Does it cite Chapter XX?
                if row.id not in ids:
                    chap20s.append(row)
                    ids.append(row.id)
In [27]:
# Get the titles of those articles.
[item.title for item in chap20s]
Out[27]:
In [28]:
len(chap20s)
Out[28]:
In [29]:
# Try to find out what articles cite paragraph 6 in Chapter 20
chap20par6s = []
ids = []
for i, row in df.iterrows():
    locations = row['Locations in A']
    starts = [item[0] for item in locations]
    if row['Decade'] in [1870, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]:
        for start in starts:
            if 411152 < start < 412177: # Does it cite paragraph 6 of Chapter XX?
                if row.id not in ids:
                    chap20par6s.append(row)
                    ids.append(row.id)
In [30]:
# Get the titles of those articles.
[item.title for item in chap20par6s]
Out[30]:
In [32]:
len(chap20par6s) # The number of items citing paragraph 6 in chapter 20
Out[32]:
In [31]:
xxStart, xxEnd = chapterLocations[20:22] # Chapter 20 Boundaries
In [32]:
print(mm[xxStart:xxStart+1000]) # Verify we have Ch. 20
In [34]:
xx = mm[xxStart:xxEnd]
In [39]:
xxParaLocations = [match.start() for match in re.finditer('\n\n+', mm)]
xxParaLocations = [x for x in xxParaLocations if (x > xxStart) and (x < xxEnd)]
In [45]:
mm[xxParaLocations[4]:xxParaLocations[5]]
Out[45]:
In [47]:
articlesWithMatches['Locations in A'].loc[0]
Out[47]:
In [48]:
def inXX(matches):
    """ Determine whether the article has a match in Ch. 20. """
    for match in matches:
        if xxStart < match[0] < xxEnd:
            return True
    return False
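inXX is hard-wired to Chapter 20's bounds; a more general helper (a hypothetical sketch, not used below) could take any index into chapterLocations:
In [ ]:
def inChapter(matches, n, bounds=chapterLocations):
    """ True if any match starts inside chunk n of the chapter segmentation. """
    start, end = bounds[n], bounds[n + 1]
    return any(start < match[0] < end for match in matches)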
In [50]:
articlesWithMatches['Locations in A'].apply(inXX).head()
Out[50]:
In [66]:
def paraIndicesIn20(matches, paraLocations=xxParaLocations):
    """ Determine paragraph numbers (indices) for matches in Ch. 20. """
    paraIndices = []
    if inXX(matches):
        paraBoundaries = list(zip(paraLocations, paraLocations[1:]))
        for match in matches:
            for i, paraBoundary in enumerate(paraBoundaries):
                # A match falls in a paragraph when their character ranges intersect.
                if set(range(match[0], match[1])) & set(range(paraBoundary[0], paraBoundary[1])):
                    paraIndices.append(i)
    else:
        paraIndices.append(None) # Placeholder for articles with no Ch. 20 match.
    return paraIndices
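The set-intersection test builds full integer sets just to detect overlap; an equivalent endpoint comparison (a sketch) does the same in constant time:
In [ ]:
def rangesOverlap(a, b):
    """ Equivalent to bool(set(range(*a)) & set(range(*b))), without materializing sets. """
    return max(a[0], b[0]) < min(a[1], b[1])
print(rangesOverlap((8, 10), (1, 9))) # True, matching the set-based check below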
In [67]:
len(set(range(8, 10)) & set(range(1, 9))) # Sanity check: overlapping ranges share at least one position.
Out[67]:
In [70]:
articlesWithMatches = articlesWithMatches.copy() # Avoid SettingWithCopyWarning on the slice of df.
articlesWithMatches['paraIndicesIn20'] = articlesWithMatches['Locations in A'].apply(paraIndicesIn20)
In [85]:
counters = list(articlesWithMatches['paraIndicesIn20'].apply(Counter))
In [88]:
grandTally = Counter()
In [89]:
for counter in counters:
    grandTally += counter
In [93]:
del grandTally[None] # Drop the placeholder for articles with no Ch. 20 match.
In [100]:
dict(grandTally)
Out[100]:
In [103]:
pd.Series(dict(grandTally)).sort_index().plot(kind='bar')
Out[103]:
In [107]:
print(mm[xxParaLocations[5]:xxParaLocations[7]]) # What are paragraphs #5 and #6?
In [ ]: