Experiment 2-B

This notebook analyzes the quotation-match data generated in Experiment 2-A (e2a): where critical articles quote Middlemarch, broken down by decade, guessed author gender, guessed country of publication, and journal.


In [35]:
import pandas as pd
%matplotlib inline
from ast import literal_eval
import numpy as np
import re
import json
from nltk.corpus import names
from collections import Counter
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
plt.style.use('ggplot')

In [36]:
with open('../middlemarch.txt') as f: 
    mm = f.read()

In [37]:
textALength = len(mm)

In [38]:
# Get chapter locations
chapterMatches = re.finditer('PRELUDE|CHAPTER|FINALE', mm)
chapterLocations = [match.start() for match in chapterMatches]
chapterLocations.append(textALength) # Append the end of the text so the final chapter gets a closing bin edge. 
len(chapterLocations)


Out[38]:
89

In [39]:
with open('../txt/e2a.json') as f: 
    rawData = f.read()

In [40]:
df = pd.read_json(rawData)

In [7]:
df.columns


Out[7]:
Index(['Locations in A', 'Locations in B', 'author', 'coverdate', 'disc_name',
       'doi', 'id', 'jcode', 'journal', 'la', 'no', 'numMatches', 'ocr',
       'pages', 'publisher_name', 'sp', 'srcHtml', 'title', 'topics', 'ty',
       'vo', 'year'],
      dtype='object')

In [8]:
df['Decade'] = df['year'] - (df['year'] % 10)
# df['Locations in A'] = df['matches'].apply(lambda x: x[1])
# df['NumMatches'] = df['matches'].apply(lambda x: x[0])

In [9]:
sum([len(item) for item in df['Locations in A'].values])


Out[9]:
1794

In [10]:
def diachronicAnalysis(df, decades=(1950, 2020)): 
    decades = np.arange(decades[0], decades[1], 10)
    # Make a dictionary of decades. 
    # Values are a list of locations.  
    decadeDict = {}
    for i, row in df.iterrows():
        decade = row['Decade']
        locations = row['Locations in A']
        if decade not in decadeDict: 
            decadeDict[decade] = locations
        else: 
            decadeDict[decade] += locations 
    # Grab the beginnings of quotes. 
    decadeStarts = {decade: [item[0] for item in loc] for decade, loc in decadeDict.items()}
    decadesBinned = {decade: 
                 np.histogram(locations, bins=chapterLocations, range=(0, textALength))[0]
                 for decade, locations in decadeStarts.items() if decade in decades}
    decadesDF = pd.DataFrame(decadesBinned).T
    # Normalize each decade's counts by that decade's maximum so rows are comparable across decades.
    decadesDF = decadesDF.div(decadesDF.max(axis=1), axis=0)
    return decadesDF

def synchronicAnalysis(df): 
    allLocations = []
    for i, row in df.iterrows(): 
        locations = row['Locations in A']
        starts = [item[0] for item in locations]
        for start in starts: 
            allLocations.append(start)
    binned = np.histogram(allLocations, bins=chapterLocations, range=(0, textALength))
    binnedDF = pd.Series(binned[0])
    return binnedDF

def plotDiachronicAnalysis(df): 
    ylabels = [str(int(decade)) for decade in df.index] + ['2020']
    plt.pcolor(df, cmap='gnuplot')
    plt.yticks(np.arange(len(df.index)+1), ylabels)
    plt.gca().invert_yaxis()
    plt.ylabel('Decade')
    plt.xlabel('Chapter')
#     plt.title("Frequency of Quotations from George Eliot's Middlemarch in Criticism, By Decade")
    plt.gca().set_xlim((0, len(df.T)))
    plt.colorbar(ticks=[])
    plt.show()
    
def plotSynchronicAnalysis(s): 
    ax = s.plot(kind='bar')
    ax.set_xlabel('Chapter')
    ax.set_ylabel('Number of Quotations')

In [11]:
plotSynchronicAnalysis(synchronicAnalysis(df))



In [12]:
sa = synchronicAnalysis(df)

In [13]:
grouped = sa.groupby(pd.cut(sa.index, 4))  # Group the per-chapter counts into four equal index ranges.
ax = grouped.sum().plot(kind='bar')
ax.set_xlabel('Chapter Ranges')
ax.set_ylabel('Number of Quotations')


Out[13]:
<matplotlib.text.Text at 0x7f6ba3085080>
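
Before plotting the heatmap, the row-normalized decade-by-chapter matrix can be inspected directly if desired (each row is divided by its own maximum, so values run from 0 to 1):

In [ ]:
# Optional: peek at the normalized decade x chapter matrix that feeds the heatmap below.
diachronicAnalysis(df).head()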

In [14]:
plotDiachronicAnalysis(diachronicAnalysis(df))


By (Guessed) Gender of Author
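
If the NLTK names corpus has not been downloaded yet, the names.words() call below will raise a LookupError; an optional, one-time setup step is to fetch it first.

In [ ]:
import nltk
# Fetches the 'names' corpus if it isn't already installed; otherwise reports it as up to date.
nltk.download('names')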


In [15]:
maleNames, femaleNames = names.words('male.txt'), names.words('female.txt')
maleNames = [name.lower() for name in maleNames]
femaleNames = [name.lower() for name in femaleNames]

In [16]:
def guessGender(name): 
    name = name.split()[0].lower() # Grab the first name. 
    if name in maleNames and name in femaleNames: 
        return 'A' #Ambiguous
    elif name in maleNames: 
        return 'M'
    elif name in femaleNames: 
        return 'F'
    else: 
        return 'U'

def averageGender(names): 
    if type(names) != list: 
        return 'U'
    genderGuesses = [guessGender(name) for name in names]
    stats = Counter(genderGuesses).most_common()
    if len(stats) == 1: 
        # Only one author. We can just use that author's gender guess. 
        return stats[0][0]
    elif stats[0][1] == stats[1][1]: # There's a tie. 
        return 'A' # Ambiguous. 
    else: 
        return stats[0][0] # Return the most common gender.
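
As an optional sanity check (not part of the original run), the guessers can be exercised on a couple of illustrative names. The possible return codes are 'M', 'F', 'A' (ambiguous), and 'U' (unknown); the exact guesses depend on what the NLTK names corpus contains.

In [ ]:
# Illustrative only: the guesses depend on the contents of the NLTK names corpus.
guessGender('George Eliot'), averageGender(['George Eliot', 'Virginia Woolf'])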

In [17]:
df['gender'] = df['author'].apply(averageGender)
dfF = df.loc[df['gender'] == 'F']
dfM = df.loc[df['gender'] == 'M']

In [18]:
# Differences in quotation counts between genders (positive bars = quoted more by male-guessed authors). 
plotSynchronicAnalysis(synchronicAnalysis(dfM) - synchronicAnalysis(dfF))


By (Guessed) Country of Publication


In [19]:
def getFirst(row): 
    if type(row) == list: 
        return row[0]
    else: 
        return row

topPublishers = df['publisher_name'].apply(getFirst).value_counts()

In [20]:
publishers = topPublishers[:80].index

In [21]:
publishers = publishers.tolist()

In [22]:
def getCountry(publisher): 
    brits = ['Oxford University Press', 'Cambridge University Press', 'Modern Humanities Research Association', \
             'BMJ', 'Taylor & Francis, Ltd.', 'Edinburgh University Press', \
             'Royal Society for the Encouragement of Arts, Manufactures and Commerce']
    canadians = ['Victorian Studies Association of Western Canada'] 
    if type(publisher) != list: 
        return 'Unknown'
    publisher = publisher[0]
    if publisher in brits: 
        return 'Britain' 
    elif publisher in canadians or 'Canada' in publisher: 
        return 'Canada' 
    elif 'GmbH' in publisher: 
        return 'Germany'
    elif 'estudios' in publisher: 
        return 'Spain'
    elif 'France' in publisher: 
        return 'France' 
    elif 'Ireland' in publisher: 
        return 'Ireland'
    else: 
        return 'US'
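
A quick spot check of the mapping (illustrative; getCountry expects the publisher field as a list, as it appears in the data):

In [ ]:
# 'Oxford University Press' is on the British list above, so this should return 'Britain'.
getCountry(['Oxford University Press'])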

In [23]:
df['country'] = df['publisher_name'].apply(getCountry)

In [24]:
df['country'].value_counts()


Out[24]:
US         3901
Unknown    1247
Britain     825
Canada       59
Germany      15
Spain         8
Ireland       8
France        6
Name: country, dtype: int64

In [25]:
dfBrits = df.loc[df['country'] == 'Britain']
dfYanks = df.loc[df['country'] == 'US']
dfCanadians = df.loc[df['country'] == 'Canada']

In [26]:
# Since British authors are greatly outnumbered in this corpus, we should normalize the data. 
britsHist = synchronicAnalysis(dfBrits) 
normBrits = britsHist.div(britsHist.max())
yanksHist = synchronicAnalysis(dfYanks)
normYanks = yanksHist.div(yanksHist.max())

In [27]:
plotSynchronicAnalysis(normYanks - normBrits)


By Journal


In [28]:
# Look at the top journals. 
df['journal'].value_counts()[:10]


Out[28]:
Victorian Studies                            424
George Eliot - George Henry Lewes Studies    206
Nineteenth-Century Fiction                   192
The Modern Language Review                   188
The Review of English Studies                185
Nineteenth-Century Literature                126
NOVEL: A Forum on Fiction                    126
Studies in the Novel                         120
Studies in English Literature, 1500-1900      85
ELH                                           77
Name: journal, dtype: int64

Compare the specialist journal, "George Eliot - George Henry Lewes Studies," with all other journals.


In [29]:
geJournals = df.loc[df['journal'] == 'George Eliot - George Henry Lewes Studies']
otherJournals = df.loc[df['journal'] != 'George Eliot - George Henry Lewes Studies']

In [30]:
# Normalize each group by its own maximum so the specialist and generalist distributions are comparable.
geDF = synchronicAnalysis(geJournals)
otherDF = synchronicAnalysis(otherJournals)
normGE = geDF.div(geDF.max())
normOther = otherDF.div(otherDF.max())

In [31]:
ax = (normGE - normOther).plot(kind='bar')
ax.set_xlabel('Chapter')
ax.set_ylabel('Specialization Index')


Out[31]:
<matplotlib.text.Text at 0x7f6b26dcb4e0>

Detour: Ch. 15


In [32]:
# Try to find out why Chapter XV was quoted so heavily in the 1980s and 1990s. 
chap15s = []
ids = []
for i, row in df.iterrows(): 
    locations = row['Locations in A']
    starts = [item[0] for item in locations]
    if row['Decade'] in [1980, 1990]: 
        for start in starts: 
            if 290371 < start < 322051: # Within Chapter XV's character range? 
                if row.id not in ids: 
                    chap15s.append(row)
                    ids.append(row.id)
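
The hard-coded offsets 290371 and 322051 are presumably the character boundaries of Chapter XV found earlier. A less brittle alternative (a sketch, assuming the Prelude is the first regex match and no stray 'CHAPTER' strings appear before Chapter XV, so that chapterLocations[15] is its start) would derive the range from chapterLocations:

In [ ]:
# Sketch: derive Chapter XV's character range instead of hard-coding it.
# Index 0 is the Prelude, so index 15 should be Chapter XV and index 16 Chapter XVI.
start15, end15 = chapterLocations[15], chapterLocations[16]
start15, end15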

In [33]:
# Get the titles of those articles. 
[item.title for item in chap15s]


Out[33]:
[['Dorothea and "Miss Brooke" in Middlemarch'],
 nan,
 ['Illuminating the Vision of Ordinary Life: A Tribute to "Middlemarch"'],
 ['"Middlemarch" and George Eliot\'s Female (Re) Vision of Shakespeare'],
 ['Heroism and Organicism in the Case of Lydgate'],
 ['THE DIALOGIC UNIVERSE OF "MIDDLEMARCH"'],
 ['Middlemarch, Realism and the Birth of the Clinic'],
 ['Microscopy and Semiotic in Middlemarch'],
 ["George Eliot's Reflexive Text: Three Tonalities in the Narrative Voice of Middlemarch"],
 ["The Victorian Discourse of Gambling: Speculations on Middlemarch and the Duke's Children"],
 ['George Eliot\'s Scrupulous Research: The Facts behind Eliot\'s Use of the "Keepsake in Middlemarch"'],
 ['The Union of "Miss Brooke" and "Middlemarch": A Study of the Manuscript'],
 ["The Turn of George Eliot's Realism"],
 ['Transformation of Rage',
  "Mourning and Creativity in George Eliot's Fiction",
  'The Vast Wreck of Ambitious Ideals in Middlemarch'],
 ['SILENCE, GESTURE, AND MEANING IN "MIDDLEMARCH"'],
 ['Heroic Commitment in Richardson, Eliot, and James',
  'POWER AS PARTIALITY IN MIDDLEMARCH'],
 ['AN END TO CONVERTING PATIENTS\' STOMACHS INTO DRUG-SHOPS: LYDGATE\'S NEW METHOD OF CHARGING HIS PATIENTS IN "MIDDLEMARCH"'],
 ['Dangerous Crossings: Dickens, Digression, and Montage'],
 ['Vital Signs',
  'Medical Realism in Nineteenth-Century Fiction',
  '“A NEW ORGAN OF KNOWLEDGE”:',
  'MEDICAL ORGANICISM AND THE LIMITS OF REALISM IN MIDDLEMARCH'],
 ['The Language of Discovery: William Whewell and George Eliot'],
 ['Lamarque and Olsen on Literature and Truth'],
 ['The Strange Case of Monomania: Patriarchy in Literature, Murder in Middlemarch, Drowning in Daniel Deronda'],
 ['George Eliot and the Eighteenth-Century Novel'],
 ['Metaphors of Mind in Fiction and Psychology',
  'TOWARD THE LIFE OF THE MIND:',
  'JAMES AND ELIOT DISCOVER SENTIENCE'],
 ['"Wrinkled Deep in Time": The Alexandria Quartet as Many-Layered Palimpsest'],
 ['ERZÄHLERISCHE OBJEKTIVITÄT, ,AUTHORIAL INTRUSIONS‘ UND ENGLISCHER REALISMUS'],
 ['Steamboat Surfacing: Scott and the English Novelists'],
 ['Professional Judgment and the Rationing of Medical Care'],
 ['NARRATIVE VOICE AND THE "FEMININE" NOVELIST: DINAH MULOCK AND GEORGE ELIOT'],
 ['Versions of Narrative: Overt and Covert Narrators in Nineteenth Century Historiography']]

In [34]:
ch15Topics = [item.topics for item in chap15s]
chap15TopicsFlat = [item for sublist in ch15Topics for item in sublist]
Counter(chap15TopicsFlat).most_common(20)


Out[34]:
[('Vocation', 9),
 ('Love', 9),
 ('Novelists', 9),
 ('Irony', 8),
 ('Pity', 8),
 ('Sympathy', 7),
 ('Gossip', 6),
 ('Narratology', 5),
 ('Novels', 5),
 ('Fear', 4),
 ('Companionship', 4),
 ('Modesty', 4),
 ('Humor', 4),
 ('Heroism', 4),
 ('Vanity', 4),
 ('Marriage ceremonies', 4),
 ('Melodrama', 4),
 ('Gambling', 4),
 ('Antitheses', 4),
 ('Pathos', 4)]
