In [1]:
%matplotlib inline
import json
import re
from ast import literal_eval
from collections import Counter
from io import StringIO

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import names
plt.rcParams["figure.figsize"] = [16, 6]
In [3]:
# Read the raw JSON text. The encoding is pinned to UTF-8 (the JSON interchange
# encoding) so decoding does not depend on the machine's locale default.
with open('../txt/e1a.json', encoding='utf-8') as f:
    rawData = f.read()
In [4]:
# Passing a literal JSON string to read_json is deprecated in pandas >= 2.1;
# wrapping the text in a file-like buffer works on old and new versions alike.
df = pd.read_json(StringIO(rawData))
In [5]:
# Round each year down to the first year of its decade (e.g. 1987 -> 1980).
df['Decade'] = df['year'].sub(df['year'].mod(10))
In [6]:
# Show the first rows of the frame, including the new 'Decade' column.
df.head()
Out[6]:
In [7]:
# Histogram of the values in the 'year' column.
df['year'].hist()
Out[7]:
In [8]:
# Upper bound of the histogram range used by diachronicAnalysis when binning
# quote locations. Presumably the length of text A in characters — TODO confirm
# the unit and how this number was obtained.
textALength = 1793449
In [9]:
# Keep element 1 of every 'matches' entry; diachronicAnalysis reads item[0] of
# each location in this column as the start of a quote.
df['Locations in A'] = df['matches'].map(lambda match: match[1])
In [14]:
def diachronicAnalysis(df, decades=(1950, 2020), textLength=None, bins=50):
    """Bin quotation start offsets by decade and normalize each decade's row.

    Parameters
    ----------
    df : DataFrame with a 'Decade' column and a 'Locations in A' column. Each
        'Locations in A' cell is an iterable of locations; the first element
        of each location (item[0]) is used as the start of the quote.
    decades : (start, stop) pair. The decades kept are
        np.arange(start, stop, 10), so `stop` itself is excluded.
    textLength : upper bound of the histogram range. When None, the
        notebook-level `textALength` is used.
    bins : number of equal-width segments the range is divided into.

    Returns
    -------
    DataFrame indexed by decade in ascending order, one column per segment.
    Each row is divided by its own maximum, so the busiest segment of a
    decade is 1.0. A decade whose counts are all zero yields NaN.
    """
    if textLength is None:
        textLength = textALength
    decades = np.arange(decades[0], decades[1], 10)

    # Gather quote starts per decade into lists owned by this function.
    # Storing a row's own list and extending it with `+=` would grow the list
    # object that lives inside the DataFrame cell, corrupting `df` and every
    # filtered copy of it that shares those list objects.
    decadeStarts = {}
    for _, row in df.iterrows():
        starts = decadeStarts.setdefault(row['Decade'], [])
        starts.extend(item[0] for item in row['Locations in A'])

    decadesBinned = {
        decade: np.histogram(starts, bins=bins, range=(0, textLength))[0]
        for decade, starts in decadeStarts.items()
        if decade in decades
    }

    # Rows would otherwise appear in first-seen order; sort so that the
    # result is chronological regardless of the order of rows in `df`.
    decadesDF = pd.DataFrame(decadesBinned).T.sort_index()

    # Normalize each decade by its own peak.
    return decadesDF.div(decadesDF.max(axis=1), axis=0)
def plotDiachronicAnalysis(decadesDF, title=None):
    """Draw a heatmap of a decade-by-segment frame (rows: decades, columns: segments).

    Parameters
    ----------
    decadesDF : frame whose index holds decade start years, as returned by
        diachronicAnalysis.
    title : optional figure title, e.g. "Frequency of Quotations from George
        Eliot's Middlemarch in Criticism, By Decade". No title is drawn when None.
    """
    ylabels = [str(int(decade)) for decade in decadesDF.index]
    if ylabels:
        # One tick per row edge means one more label than rows. The closing
        # edge is the decade after the last row, derived from the data so it
        # stays correct when the last decade shown is not 2010.
        ylabels.append(str(int(decadesDF.index[-1]) + 10))
    plt.pcolor(decadesDF, cmap='gnuplot')
    plt.yticks(np.arange(len(ylabels)), ylabels)
    plt.gca().invert_yaxis()  # first row of the frame at the top
    plt.ylabel('Decade')
    plt.xlabel('Novel Segment')
    if title is not None:
        plt.title(title)
    plt.colorbar(ticks=[])  # values are relative per row, so no tick labels
    plt.show()
def plotSynchronicAnalysis(decadesDF):
    """Bar-plot the column sums of `decadesDF`, one bar per column.

    Returns the object produced by `.plot(kind='bar')` so callers can keep
    customizing the figure.
    """
    ax = decadesDF.sum().plot(kind='bar')
    return ax
In [15]:
# Bin every article's quote locations by decade (default decade range) and
# draw the resulting decade-by-segment heatmap.
decadesDF = diachronicAnalysis(df)
plotDiachronicAnalysis(decadesDF)
In [27]:
# Lower-cased first names from the NLTK names corpus.
# Kept as sets: they are only used for membership tests, which are O(1) on a
# set but a linear scan on a list, and the test runs for every author.
maleNames = {name.lower() for name in names.words('male.txt')}
femaleNames = {name.lower() for name in names.words('female.txt')}
In [81]:
def guessGender(name):
    """Guess a gender label from the first word of `name`.

    Returns 'M' or 'F' when the lower-cased first word appears in exactly one
    of `maleNames` / `femaleNames`, 'A' (ambiguous) when it appears in both,
    and 'U' (unknown) when it appears in neither, or when `name` is not a
    string or contains no words.
    """
    if not isinstance(name, str):
        return 'U'
    words = name.split()
    if not words:
        # Empty or whitespace-only name: there is no first word to look up.
        return 'U'
    first = words[0].lower()  # Grab the first name.
    isMale = first in maleNames
    isFemale = first in femaleNames
    if isMale and isFemale:
        return 'A'  # Ambiguous
    if isMale:
        return 'M'
    if isFemale:
        return 'F'
    return 'U'
def averageGender(names):
    """Collapse the per-author gender guesses of one article into one label.

    `names` is expected to be a list of author names. Anything that is not a
    list, and an empty list, yields 'U'. Otherwise each name is passed to
    guessGender and the most frequent guess is returned; if the two most
    frequent guesses are tied, 'A' (ambiguous) is returned.
    """
    if not isinstance(names, list) or not names:
        return 'U'
    stats = Counter(guessGender(name) for name in names).most_common()
    # A single entry means every author got the same guess, so there is no
    # runner-up to tie with.
    if len(stats) > 1 and stats[0][1] == stats[1][1]:
        return 'A'  # There's a tie: ambiguous.
    return stats[0][0]  # The most common guess.
In [82]:
# Label each article with the combined gender guess for its authors, then
# pull out the rows labelled 'F' and the rows labelled 'M'.
df['gender'] = df['author'].map(averageGender)
dfF = df[df['gender'].eq('F')]
dfM = df[df['gender'].eq('M')]
In [100]:
# Per-decade quotation profiles for the 'M' and 'F' subsets.
decadesDFM = diachronicAnalysis(dfM)
decadesDFF = diachronicAnalysis(dfF)
In [110]:
# Male-minus-female difference of the normalized profiles, summed per segment.
decadesGenderDiff = decadesDFM.sub(decadesDFF)
plotSynchronicAnalysis(decadesGenderDiff)
In [128]:
def getFirst(row):
    """Return the first element of a list, or the value itself if it is not a list.

    An empty list yields None instead of raising IndexError.
    """
    if isinstance(row, list):
        return row[0] if row else None
    return row
# Tally articles by publisher, taking the first name when a list is given.
topPublishers = df['publisher_name'].map(getFirst).value_counts()
In [158]:
# Names of the 80 most frequent publishers.
publishers = topPublishers.head(80).index
In [159]:
# list() works whether `publishers` is still an Index or already a list, so
# this cell can be re-run without raising AttributeError.
publishers = list(publishers)
In [190]:
def getCountry(publisher):
    """Map a publisher entry to a country label using name heuristics.

    `publisher` is expected to be a list of publisher names; only the first
    one is inspected. Returns 'Unknown' when `publisher` is not a list, is an
    empty list, or its first entry is not a string. Otherwise the name is
    checked against a fixed list of British publishers and a few substrings
    ('Canada', 'GmbH', 'estudios', 'France', 'Ireland'); anything left over
    is labelled 'US'.
    """
    brits = ('Oxford University Press', 'Cambridge University Press',
             'Modern Humanities Research Association',
             'BMJ', 'Taylor & Francis, Ltd.', 'Edinburgh University Press',
             'Royal Society for the Encouragement of Arts, Manufactures and Commerce')
    canadians = ('Victorian Studies Association of Western Canada',)

    if not isinstance(publisher, list) or not publisher:
        return 'Unknown'
    publisher = publisher[0]
    if not isinstance(publisher, str):
        # The substring tests below need text.
        return 'Unknown'

    if publisher in brits:
        return 'Britain'
    if publisher in canadians or 'Canada' in publisher:
        return 'Canada'
    # Remaining countries are recognized by a marker substring, tested in order.
    for marker, country in (('GmbH', 'Germany'), ('estudios', 'Spain'),
                            ('France', 'France'), ('Ireland', 'Ireland')):
        if marker in publisher:
            return country
    return 'US'  # fallback for every publisher not matched above
In [193]:
# Attach a country label to every article based on its publisher entry.
df['country'] = df['publisher_name'].map(getCountry)
In [195]:
# Number of articles per country label.
df['country'].value_counts()
Out[195]:
In [200]:
# One subset per country label of interest.
dfBrits = df[df['country'].eq('Britain')]
dfYanks = df[df['country'].eq('US')]
dfCanadians = df[df['country'].eq('Canada')]
In [201]:
# Per-decade quotation profiles for the British and US subsets.
decadesDFBrits = diachronicAnalysis(dfBrits)
decadesDFYanks = diachronicAnalysis(dfYanks)
In [204]:
# US-minus-Britain difference of the normalized profiles, summed per segment.
plotSynchronicAnalysis(decadesDFYanks.sub(decadesDFBrits))
In [213]:
# Look at the top journals: the ten most frequent values of the 'journal' column.
df['journal'].value_counts()[:10]
Out[213]:
Compare the specialist journal, "George Eliot - George Henry Lewes Studies," with all other journals.
In [211]:
# Split the corpus into the specialist journal and everything else.
geJournals = df[df['journal'].eq('George Eliot - George Henry Lewes Studies')]
otherJournals = df[df['journal'].ne('George Eliot - George Henry Lewes Studies')]
In [228]:
# Specialist-journal profile minus the profile of all other journals, summed per segment.
ax = plotSynchronicAnalysis(
    diachronicAnalysis(geJournals).sub(diachronicAnalysis(otherJournals))
)
In [ ]: