In [35]:
import pandas as pd
%matplotlib inline
from ast import literal_eval
import numpy as np
import re
import json
from nltk.corpus import names
from collections import Counter
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
plt.style.use('ggplot')
In [36]:
with open('../middlemarch.txt') as f:
    mm = f.read()
In [37]:
textALength = len(mm)
In [38]:
# Get chapter locations
chapterMatches = re.finditer('PRELUDE|CHAPTER|FINALE', mm)
chapterLocations = [match.start() for match in chapterMatches]
chapterLocations.append(textALength)  # Append the end of the text as a final boundary, so the last section gets its own bin.
len(chapterLocations)
Out[38]:
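A quick sanity check (not part of the original run): inspect the first few detected boundaries to confirm the regex is matching real section headings rather than incidental occurrences of the word "CHAPTER" in the text.
In [ ]:
# Print a short snippet of the text at each of the first few boundaries.
for loc in chapterLocations[:5]:
    print(loc, repr(mm[loc:loc + 20]))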
In [39]:
with open('../txt/e2a.json') as f:
    rawData = f.read()
In [40]:
df = pd.read_json(rawData)
In [7]:
df.columns
Out[7]:
In [8]:
df['Decade'] = df['year'] - (df['year'] % 10)
# df['Locations in A'] = df['matches'].apply(lambda x: x[1])
# df['NumMatches'] = df['matches'].apply(lambda x: x[0])
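The modulo arithmetic above floors each year down to the start of its decade, since year % 10 is the year's offset within the decade. For instance:
In [ ]:
1987 - (1987 % 10)  # 1987 - 7 = 1980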
In [9]:
sum([len(item) for item in df['Locations in A'].values])
Out[9]:
In [10]:
def diachronicAnalysis(df, decades=(1950, 2020)):
    decades = np.arange(decades[0], decades[1], 10)
    # Make a dictionary of decades.
    # Values are a list of locations.
    decadeDict = {}
    for i, row in df.iterrows():
        decade = row['Decade']
        locations = row['Locations in A']
        if decade not in decadeDict:
            decadeDict[decade] = list(locations)  # Copy, so we don't mutate the row's list below.
        else:
            decadeDict[decade] += locations
    # Grab the beginnings of quotes.
    decadeStarts = {decade: [item[0] for item in loc] for decade, loc in decadeDict.items()}
    decadesBinned = {decade:
                     np.histogram(locations, bins=chapterLocations, range=(0, textALength))[0]
                     for decade, locations in decadeStarts.items() if decade in decades}
    decadesDF = pd.DataFrame(decadesBinned).T
    # Normalize each decade by its own maximum.
    decadesDF = decadesDF.div(decadesDF.max(axis=1), axis=0)
    return decadesDF

def synchronicAnalysis(df):
    allLocations = []
    for i, row in df.iterrows():
        locations = row['Locations in A']
        starts = [item[0] for item in locations]
        for start in starts:
            allLocations.append(start)
    binned = np.histogram(allLocations, bins=chapterLocations, range=(0, textALength))
    binnedDF = pd.Series(binned[0])
    return binnedDF

def plotDiachronicAnalysis(df):
    ylabels = [str(int(decade)) for decade in df.index] + ['2020']
    plt.pcolor(df, cmap='gnuplot')
    plt.yticks(np.arange(len(df.index)+1), ylabels)
    plt.gca().invert_yaxis()
    plt.ylabel('Decade')
    plt.xlabel('Chapter')
    # plt.title("Frequency of Quotations from George Eliot's Middlemarch in Criticism, By Decade")
    plt.gca().set_xlim((0, len(df.T)))
    plt.colorbar(ticks=[])
    plt.show()

def plotSynchronicAnalysis(s):
    ax = s.plot(kind='bar')
    ax.set_xlabel('Chapter')
    ax.set_ylabel('Number of Quotations')
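A minimal sketch, with made-up numbers, of the row-wise normalization used in diachronicAnalysis: dividing each row by its own maximum scales every decade to peak at 1.0, so decades with very different citation volumes remain comparable in the heatmap.
In [ ]:
demo = pd.DataFrame([[2, 4], [10, 5]])
demo.div(demo.max(axis=1), axis=0)  # rows become [0.5, 1.0] and [1.0, 0.5]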
In [11]:
plotSynchronicAnalysis(synchronicAnalysis(df))
In [12]:
sa = synchronicAnalysis(df)
In [13]:
grouped = sa.groupby(pd.cut(sa.index, 4))  # pd.groupby() was removed from pandas; group via the Series method.
ax = grouped.sum().plot(kind='bar')
ax.set_xlabel('Chapter Ranges')
ax.set_ylabel('Number of Quotations')
Out[13]:
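Here pd.cut slices the chapter index into four equal-width interval bins; a small illustration with a toy index:
In [ ]:
pd.cut(np.arange(8), 4)  # four equal-width bins covering 0-7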
In [14]:
plotDiachronicAnalysis(diachronicAnalysis(df))
In [15]:
maleNames, femaleNames = names.words('male.txt'), names.words('female.txt')
maleNames = [name.lower() for name in maleNames]
femaleNames = [name.lower() for name in femaleNames]
In [16]:
def guessGender(name):
    name = name.split()[0].lower()  # Grab the first name.
    if name in maleNames and name in femaleNames:
        return 'A'  # Ambiguous
    elif name in maleNames:
        return 'M'
    elif name in femaleNames:
        return 'F'
    else:
        return 'U'  # Unknown

def averageGender(names):
    if not isinstance(names, list):
        return 'U'
    genderGuesses = [guessGender(name) for name in names]
    stats = Counter(genderGuesses).most_common()
    if len(stats) == 1:
        # Only one distinct guess, so we can just use that guess.
        return stats[0][0]
    elif stats[0][1] == stats[1][1]:  # There's a tie.
        return 'A'  # Ambiguous.
    else:
        return stats[0][0]  # Return the most common gender.
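A few hypothetical author strings (not drawn from the corpus) showing how these helpers behave; note that guessGender only inspects the first token, so bare initials come back 'U' for unknown.
In [ ]:
for author in ['Mary Shelley', 'John Ruskin', 'G. Eliot']:
    print(author, '->', guessGender(author))
print(averageGender(['Mary Shelley', 'John Ruskin']))  # likely a tie, so 'A'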
In [17]:
df['gender'] = df['author'].apply(averageGender)
dfF = df.loc[df['gender'] == 'F']
dfM = df.loc[df['gender'] == 'M']
In [18]:
# Difference in citations between genders (male counts minus female counts).
plotSynchronicAnalysis(synchronicAnalysis(dfM) - synchronicAnalysis(dfF))
In [19]:
def getFirst(row):
    if isinstance(row, list):
        return row[0]
    else:
        return row

topPublishers = df['publisher_name'].apply(getFirst).value_counts()
In [20]:
publishers = topPublishers[:80].index
In [21]:
publishers = publishers.tolist()
In [22]:
def getCountry(publisher):
    brits = ['Oxford University Press', 'Cambridge University Press',
             'Modern Humanities Research Association', 'BMJ',
             'Taylor & Francis, Ltd.', 'Edinburgh University Press',
             'Royal Society for the Encouragement of Arts, Manufactures and Commerce']
    canadians = ['Victorian Studies Association of Western Canada']
    if not isinstance(publisher, list):
        return 'Unknown'
    publisher = publisher[0]
    if publisher in brits:
        return 'Britain'
    elif publisher in canadians or 'Canada' in publisher:
        return 'Canada'
    elif 'GmbH' in publisher:
        return 'Germany'
    elif 'estudios' in publisher:
        return 'Spain'
    elif 'France' in publisher:
        return 'France'
    elif 'Ireland' in publisher:
        return 'Ireland'
    else:
        return 'US'
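Two hypothetical inputs showing the heuristic's behavior; publisher_name values are lists, and anything unmatched falls through to 'US'.
In [ ]:
print(getCountry(['Oxford University Press']))  # Britain
print(getCountry(['Some Unlisted Press']))      # US -- the catch-all default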
In [23]:
df['country'] = df['publisher_name'].apply(getCountry)
In [24]:
df['country'].value_counts()
Out[24]:
In [25]:
dfBrits = df.loc[df['country'] == 'Britain']
dfYanks = df.loc[df['country'] == 'US']
dfCanadians = df.loc[df['country'] == 'Canada']
In [26]:
# Since British publishers are greatly outnumbered in this corpus, normalize each distribution by its maximum.
britsHist = synchronicAnalysis(dfBrits)
normBrits = britsHist.div(britsHist.max())
yanksHist = synchronicAnalysis(dfYanks)
normYanks = yanksHist.div(yanksHist.max())
In [27]:
plotSynchronicAnalysis(normYanks - normBrits)
In [28]:
# Look at the top journals.
df['journal'].value_counts()[:10]
Out[28]:
Compare the specialist journal, "George Eliot - George Henry Lewes Studies," with all other journals.
In [29]:
geJournals = df.loc[df['journal'] == 'George Eliot - George Henry Lewes Studies']
otherJournals = df.loc[df['journal'] != 'George Eliot - George Henry Lewes Studies']
In [30]:
# Normalize
geDF = synchronicAnalysis(geJournals)
otherDF = synchronicAnalysis(otherJournals)
normGE = geDF.div(geDF.max())
normOther = otherDF.div(otherDF.max())
In [31]:
ax = (normGE - normOther).plot(kind='bar')
ax.set_xlabel('Chapter')
ax.set_ylabel('Specialization Index')
Out[31]:
In [32]:
# Try to find out why Ch. 15 was so heavily quoted in the 1980s and 1990s.
chap15s = []
ids = []
for i, row in df.iterrows():
    locations = row['Locations in A']
    starts = [item[0] for item in locations]
    if row['Decade'] in [1980, 1990]:
        for start in starts:
            if 290371 < start < 322051:  # Does it cite Chapter XV?
                if row.id not in ids:
                    chap15s.append(row)
                    ids.append(row.id)
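The byte offsets above are hard-coded; assuming the regex matched the Prelude first and each chapter heading exactly once, the same bounds could be derived from chapterLocations (the Prelude is segment 0, so Chapter XV starts at index 15):
In [ ]:
ch15Start, ch15End = chapterLocations[15], chapterLocations[16]
print(ch15Start, ch15End)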
In [33]:
# Get the titles of those articles.
[item.title for item in chap15s]
Out[33]:
In [34]:
chap15Topics = [item.topics for item in chap15s]
chap15TopicsFlat = [item for sublist in chap15Topics for item in sublist]
Counter(chap15TopicsFlat).most_common(20)
Out[34]: