Experiment 1-A

This experiment used the full corpus of 6K+ texts scraped from JSTOR.


In [1]:
import json
import re
from ast import literal_eval
from collections import Counter
from io import StringIO

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import names

%matplotlib inline
plt.rcParams["figure.figsize"] = [16, 6]

In [3]:
# Read the raw JSON export as text. The encoding is given explicitly so the
# result does not depend on the platform's default (assumes the file is UTF-8,
# JSON's standard encoding).
with open('../txt/e1a.json', encoding='utf-8') as f:
    rawData = f.read()

In [4]:
# Wrap the JSON text in a file-like object: newer pandas versions deprecate
# passing a literal JSON string straight to read_json.
df = pd.read_json(StringIO(rawData))

In [5]:
# Floor each year to the first year of its decade (e.g. 1984 -> 1980).
df['Decade'] = df['year'].sub(df['year'].mod(10))

In [6]:
# Preview the first five rows of the corpus metadata.
df.head(n=5)


Out[6]:
author coverdate disc_name doi id jcode journal la matches no ... pages publisher_name sp srcHtml title topics ty vo year Decade
0 [Harriet Farwell Adams] [19840601] [Language & Literature, Humanities] 10.2307/3044822 c6e6ce20-79c4-3c59-af91-b06c3208b37b [ninecentfict] Nineteenth-Century Fiction [eng] [23, [[5809, 6218], [8751, 8851], [8890, 9046]... [1] ... 69-90 [University of California Press] 69 <cite>Nineteenth-Century Fiction</cite>, Vol. ... [Dorothea and "Miss Brooke" in Middlemarch] [Sentiment, Fear, Martyrdom, Envy, Vocation, G... fla [39] 1984 1980
1 [HUGH WITEMEYER] [19910901] [Language & Literature, Humanities] 10.2307/43470798 0d7eb58a-e4c1-326b-a195-012da1a4eb11 [georelioghlnews] The George Eliot, George Henry Lewes Newsletter [eng] [0, [], []] [18/19] ... 73-78 [Penn State University Press] 73 <cite>The George Eliot, George Henry Lewes New... NaN [Lecture methods, Feminism, Pedagogy, Novelist... brv NaN 1991 1990
2 [Alison Cree, Louis J. Guillette, <suffix>Jr.<... [19950601] [Biological Sciences, Science and Mathematics,... 10.2307/1564553 f7384b7a-36be-3f0f-ac0b-b66455da0d36 [jherpetology] NaN [eng] [0, [], []] [2] ... 163-173 [Society for the Study of Amphibians and Repti... 163 <cite>Journal of Herpetology</cite>, Vol. 29, ... [Biennial Reproduction with a Fourteen-Month P... [Animal vivipary, Parturition, Fat body, Skink... fla [29] 1995 1990
3 [Calvin Bedient] [19690401] [Language & Literature, Humanities] 10.2307/3849222 ba20b1ad-b273-3608-bc9b-17b13f6d4e68 [hudsonreview] The Hudson Review [eng] [2, [[1820, 1922], [1787715, 1787883]], [[2324... [1] ... 70-84 [Hudson Review, Inc] 70 <cite>The Hudson Review</cite>, Vol. 22, No. 1... [Middlemarch: Touching Down] [Immortality, Asceticism, Sentimentality, Mete... fla [22] 1969 1960
4 [Jane S. Smith] [19770701] [Language & Literature, Humanities] 10.2307/40754482 ef12c01b-c42d-39b8-84dd-d00e6538143c [texastudlitelang] Texas Studies in Literature and Language [eng] [5, [[397716, 398239], [1588815, 1589172], [15... [2] ... 188-203 [University of Texas Press] 188 <cite>Texas Studies in Literature and Language... [The Reader as Part of the Fiction: Middlemarch] [Novels, Flattery, Humor, Vanity, Meditation, ... fla [19] 1977 1970

5 rows × 21 columns


In [7]:
# Distribution of the corpus over years. Label the axes so the figure can be
# read on its own; the trailing semicolon suppresses the text repr.
ax = df['year'].hist()
ax.set_xlabel('Year')
ax.set_ylabel('Number of texts');


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcee449048>

In [8]:
# Upper bound of the histogram range used by diachronicAnalysis when binning
# quotation locations. Presumably the total length of text A (the quoted
# novel) in characters -- TODO confirm against the source text.
textALength = 1793449

In [9]:
# Keep only element 1 of each `matches` entry; diachronicAnalysis reads these
# as the match locations (presumably offsets into text A -- confirm).
df['Locations in A'] = df['matches'].map(lambda matchRecord: matchRecord[1])

In [14]:
def diachronicAnalysis(df, decades=(1950, 2020), bins=50, textLength=None):
    """Bin quotation start locations by decade and normalize each decade.

    Parameters
    ----------
    df : DataFrame
        Must have a 'Decade' column and a 'Locations in A' column whose cells
        are lists of locations; the first element of each location is used as
        the start of the quotation.
    decades : (start, stop)
        Decades to keep, generated as np.arange(start, stop, 10).
    bins : int
        Number of equal-width segments the text is divided into.
    textLength : number, optional
        Upper bound of the histogram range. Defaults to the notebook-level
        constant `textALength`.

    Returns
    -------
    DataFrame
        One row per decade (sorted chronologically), one column per segment.
        Each row is divided by its own maximum, so a decade without any
        quotations yields a row of NaN.
    """
    if textLength is None:
        # Fall back to the notebook-level constant, as the code always did.
        textLength = textALength
    wantedDecades = set(np.arange(decades[0], decades[1], 10).tolist())
    # Collect the start of every quotation, grouped by decade. Fresh lists are
    # built here on purpose: accumulating with `+=` on the row's own list
    # would extend the list stored inside `df` and corrupt later analyses.
    decadeStarts = {}
    for decade, locations in zip(df['Decade'], df['Locations in A']):
        if decade not in wantedDecades:
            continue
        decadeStarts.setdefault(decade, []).extend(item[0] for item in locations)
    decadesBinned = {decade: np.histogram(starts, bins=bins, range=(0, textLength))[0]
                     for decade, starts in decadeStarts.items()}
    # Sort so rows are chronological regardless of the row order of `df`.
    decadesDF = pd.DataFrame(decadesBinned).T.sort_index()
    # Normalize each decade by its busiest segment.
    decadesDF = decadesDF.div(decadesDF.max(axis=1), axis=0)
    return decadesDF

def plotDiachronicAnalysis(decadesDF):
    """Draw decadesDF as a heatmap with one row per decade.

    decadesDF is expected to be indexed by decade (values convertible to int),
    as returned by diachronicAnalysis. Rows are drawn top to bottom in index
    order; columns are the novel segments.
    """
    ylabels = [str(int(decade)) for decade in decadesDF.index]
    if ylabels:
        # pcolor draws one cell per row, so there is one more row edge than
        # rows. Label the closing edge with the decade after the last row
        # instead of a fixed '2020', so other decade ranges are labelled
        # correctly too.
        ylabels.append(str(int(decadesDF.index[-1]) + 10))
    plt.pcolor(decadesDF, cmap='gnuplot')
    plt.yticks(np.arange(len(ylabels)), ylabels)
    plt.gca().invert_yaxis()
    plt.ylabel('Decade')
    plt.xlabel('Novel Segment')
#     plt.title("Frequency of Quotations from George Eliot's Middlemarch in Criticism, By Decade")
    plt.colorbar(ticks=[])
    plt.show()
    
def plotSynchronicAnalysis(decadesDF):
    """Plot the column sums of decadesDF as a bar chart.

    Summing over the rows (decades) collapses the time dimension, leaving one
    bar per column (novel segment).

    Returns the object produced by `.plot(kind='bar')` so callers can keep
    customizing the figure (e.g. `ax = plotSynchronicAnalysis(...)`).
    """
    ax = decadesDF.sum().plot(kind='bar')
    return ax

In [15]:
# Bin the quotation locations of the whole corpus by decade (default decade
# range), then draw the result as a heatmap.
decadesDF = diachronicAnalysis(df)
plotDiachronicAnalysis(decadesDF)


By (Guessed) Gender of Author


In [27]:
# Lower-cased first-name lists from the NLTK names corpus, used by guessGender.
maleNames = [name.lower() for name in names.words('male.txt')]
femaleNames = [name.lower() for name in names.words('female.txt')]

In [81]:
def guessGender(name):
    """Guess a gender label from the first token of a name.

    Returns 'M' or 'F' when the lower-cased first token appears in only one
    of the module-level `maleNames` / `femaleNames` lists, 'A' (ambiguous)
    when it appears in both, and 'U' (unknown) when it appears in neither or
    when `name` is empty, blank, or not a string.
    """
    # Guard against empty/blank or non-string names: indexing [0] on an empty
    # split would raise instead of yielding "unknown".
    tokens = name.split() if isinstance(name, str) else []
    if not tokens:
        return 'U'
    firstName = tokens[0].lower()  # Grab the first name.
    isMale = firstName in maleNames
    isFemale = firstName in femaleNames
    if isMale and isFemale:
        return 'A'  # Ambiguous
    if isMale:
        return 'M'
    if isFemale:
        return 'F'
    return 'U'

def averageGender(names):
    """Reduce a list of author names to a single gender label.

    Each name is classified with guessGender and the most common label wins.
    Returns 'A' (ambiguous) when the two most common labels are tied, and 'U'
    (unknown) when `names` is not a list or is an empty list.
    """
    # A missing value (e.g. NaN) or an empty author list carries no signal.
    if not isinstance(names, list) or not names:
        return 'U'
    genderGuesses = [guessGender(name) for name in names]
    stats = Counter(genderGuesses).most_common()
    # With two or more distinct labels, a tie between the top two is ambiguous.
    if len(stats) > 1 and stats[0][1] == stats[1][1]:
        return 'A'  # Ambiguous.
    # Either all authors share one label, or one label is strictly most common.
    return stats[0][0]

In [82]:
# Label every text with the guessed gender of its author list, then keep the
# female-labelled and male-labelled subsets for comparison.
df['gender'] = df['author'].map(averageGender)
dfF = df[df['gender'] == 'F']
dfM = df[df['gender'] == 'M']

In [100]:
# Per-decade quotation profiles for the male- and female-labelled subsets.
decadesDFM = diachronicAnalysis(dfM)
decadesDFF = diachronicAnalysis(dfF)

In [110]:
# Male-minus-female difference in the normalized quotation profiles,
# summed over decades and shown per novel segment.
decadesGenderDiff = decadesDFM.sub(decadesDFF)
plotSynchronicAnalysis(decadesGenderDiff)


By (Guessed) Country of Publication


In [128]:
def getFirst(row):
    """Return the first element of a list cell; pass other values through.

    An empty list yields None (instead of raising IndexError), so it is
    treated as a missing value by later counting.
    """
    if isinstance(row, list):
        return row[0] if row else None
    return row

# Count how many texts each (first-listed) publisher accounts for.
topPublishers = df['publisher_name'].map(getFirst).value_counts()

In [158]:
# Names of the 80 most frequent publishers.
publishers = topPublishers.iloc[:80].index

In [159]:
# Build the list from topPublishers rather than from `publishers` itself, so
# re-running this cell does not fail by calling .tolist() on a plain list.
publishers = topPublishers[:80].index.tolist()

In [190]:
def getCountry(publisher):
    """Guess the country of publication from a `publisher_name` cell.

    Only the first entry of the list is inspected. Returns 'Unknown' when the
    cell is not a list, is an empty list, or its first entry is not a string.
    Known British publishers and a few keyword matches are mapped to their
    countries; anything else is labelled 'US', so that label is a fallback
    rather than a positive identification.
    """
    brits = ('Oxford University Press', 'Cambridge University Press',
             'Modern Humanities Research Association', 'BMJ',
             'Taylor & Francis, Ltd.', 'Edinburgh University Press',
             'Royal Society for the Encouragement of Arts, Manufactures and Commerce')
    canadians = ('Victorian Studies Association of Western Canada',)
    # Missing values (e.g. NaN) and empty lists carry no publisher at all.
    if not isinstance(publisher, list) or not publisher:
        return 'Unknown'
    firstPublisher = publisher[0]
    # The substring checks below require text.
    if not isinstance(firstPublisher, str):
        return 'Unknown'
    if firstPublisher in brits:
        return 'Britain'
    if firstPublisher in canadians or 'Canada' in firstPublisher:
        return 'Canada'
    # Ordered keyword -> country checks; the first match wins.
    keywordCountries = (('GmbH', 'Germany'),
                        ('estudios', 'Spain'),
                        ('France', 'France'),
                        ('Ireland', 'Ireland'))
    for keyword, country in keywordCountries:
        if keyword in firstPublisher:
            return country
    return 'US'

In [193]:
# Attach the guessed country of publication to every text.
df['country'] = df['publisher_name'].map(getCountry)

In [195]:
# How many texts fall under each guessed country.
df.loc[:, 'country'].value_counts()


Out[195]:
US         3901
Unknown    1247
Britain     825
Canada       59
Germany      15
Ireland       8
Spain         8
France        6
Name: country, dtype: int64

In [200]:
# Subsets of the corpus by guessed country of publication.
dfBrits = df[df['country'] == 'Britain']
dfYanks = df[df['country'] == 'US']
dfCanadians = df[df['country'] == 'Canada']

In [201]:
# Per-decade quotation profiles for the British and US subsets.
decadesDFBrits = diachronicAnalysis(dfBrits)
decadesDFYanks = diachronicAnalysis(dfYanks)

In [204]:
# US-minus-British difference in the normalized quotation profiles.
plotSynchronicAnalysis(decadesDFYanks.sub(decadesDFBrits))


By Journal


In [213]:
# The ten journals contributing the most texts.
df['journal'].value_counts().head(10)


Out[213]:
Victorian Studies                            424
George Eliot - George Henry Lewes Studies    206
Nineteenth-Century Fiction                   192
The Modern Language Review                   188
The Review of English Studies                185
NOVEL: A Forum on Fiction                    126
Nineteenth-Century Literature                126
Studies in the Novel                         120
Studies in English Literature, 1500-1900      85
ELH                                           77
Name: journal, dtype: int64

Compare the specialist journal, "George Eliot - George Henry Lewes Studies," with all other journals.


In [211]:
# Split the corpus into the specialist journal and everything else
# (texts with no journal recorded land in the "other" group).
geJournals = df[df['journal'].eq('George Eliot - George Henry Lewes Studies')]
otherJournals = df[df['journal'].ne('George Eliot - George Henry Lewes Studies')]

In [228]:
# Specialist-journal-minus-other difference in the normalized quotation profiles.
ax = plotSynchronicAnalysis(
    diachronicAnalysis(geJournals).sub(diachronicAnalysis(otherJournals))
)



In [ ]: