In [3]:
import pandas as pd
import numpy as np
#import spacy
import re
import json
import altair as alt
from nltk.corpus import names  # Needed below for the author-gender guesses
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [16, 6]
plt.style.use('ggplot')
In [ ]:
with open('../middlemarch.txt') as f:
    mm = f.read()
In [ ]:
textALength = len(mm)
In [ ]:
# Get chapter locations
chapterMatches = re.finditer('PRELUDE|CHAPTER|FINALE', mm)
chapterLocations = [match.start() for match in chapterMatches]
chapterLocations.append(textALength) # Append the text's end to bound the last chunk.
len(chapterLocations)
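Since all-caps "CHAPTER" could in principle occur outside headings, a quick sanity check (a sketch) prints a little context around the first few matches:
In [ ]:
from itertools import islice
# Eyeball the first few heading matches to confirm they are real headings.
for match in islice(re.finditer('PRELUDE|CHAPTER|FINALE', mm), 5):
    print(repr(mm[match.start():match.start() + 40]))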
In [ ]:
# Get book locations
bookLocations = [match.start() for match in re.finditer('\nBOOK', mm)]
bookLocations = [0] + bookLocations + [textALength] # Add the text's start and end to bound the first and last chunks.
bookLocations
In [4]:
def getChapters(text):
    """ Split the text into chunks at the chapter locations found above. """
    chapters = []
    for i, loc in enumerate(chapterLocations):
        if i != len(chapterLocations)-1:
            chapter = text[loc:chapterLocations[i+1]]  # Slice the argument, not the global mm.
            chapters.append(chapter)
    return chapters
In [5]:
chapters = getChapters(mm)
chapterLengths = [len(chapter.split()) for chapter in chapters]
chapterLengthsSeries = pd.Series(chapterLengths)
chapterLengthsSeries.plot(kind='bar', title='Chapter Lengths')
In [6]:
df = pd.read_json('../data/e3.json')
In [7]:
df['Decade'] = df['year'] - (df['year'] % 10) # Floor each year to its decade, e.g. 1987 -> 1980.
# df['Locations in A'] = df['matches'].apply(lambda x: x[1])
# df['NumMatches'] = df['matches'].apply(lambda x: x[0])
In [8]:
sum([len(item) for item in df['Locations in A'].values]) # Total number of matched quotations across all articles
Out[8]:
In [9]:
len(df) # Total articles with "Middlemarch" mentioned somewhere
Out[9]:
Find only those with non-trivial quotations from Middlemarch:
In [10]:
articlesWithMatches = df[df['Locations in A'].apply(lambda x: len(x) > 0)]
articlesWithMatches.year.describe()
Out[10]:
In [11]:
articlesWithMatches.Wordcounts.apply(len).head()
Out[11]:
In [12]:
# articlesWithMatches.to_json('../data/cleaned-matches.json')
In [13]:
alt.Chart(articlesWithMatches).mark_bar().encode(x='year:O', y='count()').properties(width=1000)
In [14]:
df.columns
Out[14]:
In [15]:
df[df['Quoted Words'] > 0]['disc_name'].value_counts().head()
Out[15]:
In [16]:
def isGarbage(itemTitle):
    """ Flag paratexts (front/back matter, contents, covers) that aren't real articles. """
    badTitles = ['front matter', 'back matter', 'table of contents', 'cover']
    if itemTitle is None:  # No title to check, so don't flag it.
        return False
    for title in itemTitle:
        for badTitle in badTitles:
            if badTitle in title.lower():
                return True
    return False
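A quick spot check of the filter on illustrative, made-up title lists:
In [ ]:
print(isGarbage(['Front Matter']))       # True: matches 'front matter'
print(isGarbage(['Middlemarch at 150'])) # False: looks like a real article
print(isGarbage(None))                   # False: nothing to check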
In [17]:
len(df[df.title.apply(isGarbage)]) # How many garbage items?
Out[17]:
In [18]:
df['Quoted Words'].describe()
Out[18]:
In [19]:
articlesWithMatches['Quoted Words'].describe()
Out[19]:
In [20]:
len(df[df['Quoted Words'] > 0])
Out[20]:
In [21]:
articlesWithMatches['Quoted Words'].hist()
In [22]:
articlesWithMatches['Wordcounts'].apply(np.mean).head()
Out[22]:
In [23]:
articlesWithMatches['Wordcounts'].apply(np.mean).describe()
Out[23]:
In [24]:
def diachronicAnalysis(df, decades=(1950, 2020), bins=chapterLocations, useWordcounts=True, normalize=True):
    """ With useWordcounts on, each quotation is weighted by its word count;
    with it off, raw numbers of quotations are used. """
    decades = np.arange(decades[0], decades[1], 10)
    # Make a dictionary of decades.
    # Values are a list of locations.
    decadeDict = {}
    for i, row in df.iterrows():
        decade = row['Decade']
        locationsAndWordcounts = row['Locations in A with Wordcounts']
        if decade not in decadeDict:
            decadeDict[decade] = locationsAndWordcounts.copy()
        else:
            decadeDict[decade] += locationsAndWordcounts.copy()
    # Grab the beginnings of quotes, paired with their word counts.
    decadeStartsWeights = {decade: [(item[0][0], item[1])
                                    for item in loc]
                           for decade, loc in decadeDict.items()}
    if useWordcounts:
        decadesBinned = {decade:
                         np.histogram([loc[0] for loc in locations],
                                      bins=bins,
                                      weights=[loc[1] for loc in locations],
                                      range=(0, textALength))[0]
                         for decade, locations in decadeStartsWeights.items()
                         if decade in decades}
    else:
        decadesBinned = {decade:
                         np.histogram([loc[0] for loc in locations],
                                      bins=bins,
                                      range=(0, textALength))[0]
                         for decade, locations in decadeStartsWeights.items()
                         if decade in decades}
    decadesDF = pd.DataFrame(decadesBinned).T
    # Normalize each decade's row by its maximum, so rows are comparable across decades.
    if normalize:
        decadesDF = decadesDF.div(decadesDF.max(axis=1), axis=0)
    return decadesDF
def countWords(locRange):
    """ Counts words in Middlemarch, given a character range. """
    chunk = mm[locRange[0]:locRange[1]]
    return len(chunk.split())

def totalWords(locRangeSet):
    """ Counts total words in a list of location ranges. """
    return sum([countWords(locRange) for locRange in locRangeSet])

def countsPerSet(locRangeSet):
    """ Returns an augmented location range set that includes word counts. """
    return [(locRange, countWords(locRange))
            for locRange in locRangeSet]

def extractWordcounts(locsAndWordcounts):
    """
    Takes pairs of location ranges and wordcounts,
    and returns just the wordcounts.
    """
    return [item[1] for item in locsAndWordcounts]
def synchronicAnalysis(df, bins=chapterLocations, useWordcounts=True):
    locs = df['Locations in A'].values
    locCounts = [(loc, countWords(loc)) for locSet in locs
                 for loc in locSet]
    starts = [loc[0][0] for loc in locCounts]
    counts = [loc[1] for loc in locCounts]
    if useWordcounts:
        binned = np.histogram(starts, bins=bins,
                              weights=counts, range=(0, textALength))
    else:
        binned = np.histogram(starts, bins=bins,
                              range=(0, textALength))
    binnedDF = pd.Series(binned[0])
    return binnedDF
def plotDiachronicAnalysis(df, save=False, reverse=False):
    ylabels = [str(int(decade)) for decade in df.index] + ['2020']
    plt.pcolor(df, cmap='gnuplot')
    plt.yticks(np.arange(len(df.index)+1), ylabels)
    plt.gca().invert_yaxis()
    plt.ylabel('Decade')
    plt.xlabel('Chapter')
    plt.gca().set_xlim((0, len(df.T)))
    plt.colorbar(ticks=[])
    if save:
        plt.savefig('diachronic.png', bbox_inches='tight', dpi=300, transparent=True)
    plt.show()
def plotSynchronicAnalysis(s, useWordcounts=True):
    ax = s.plot(kind='bar')
    ax.set_xlabel('Chapter')
    if useWordcounts:
        ax.set_ylabel('Number of Words Quoted')
    else:
        ax.set_ylabel('Number of Quotations')
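Both analyses lean on np.histogram's weights argument: with weights, each quotation start contributes its word count to its bin instead of 1. (When bins is an explicit list of edges, as with chapterLocations, the range argument has no effect.) A toy illustration:
In [ ]:
# Two quote starts land in the first bin (edges 0-10), one in the second (10-20).
toyStarts = [2, 5, 12]
toyCounts = [100, 50, 25]
print(np.histogram(toyStarts, bins=[0, 10, 20])[0])                    # [2 1] -- raw quotation counts
print(np.histogram(toyStarts, bins=[0, 10, 20], weights=toyCounts)[0]) # [150. 25.] -- word-count weighted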
In [25]:
df['Quoted Words'] = df['Locations in A'].apply(totalWords)
In [24]:
df['Locations in A with Wordcounts'] = df['Locations in A'].apply(countsPerSet)
In [25]:
# Verify that the diachronic wordcounts are the same as the synchronic wordcounts
decadeSums = diachronicAnalysis(df, decades=(1700, 2020), useWordcounts=True, normalize=False).sum(axis=1)
decadeSums.sum()
Out[25]:
In [26]:
chapterSums = synchronicAnalysis(df)
chapterSums.sum()
Out[26]:
In [27]:
df['Wordcounts'] = df['Locations in A with Wordcounts'].apply(extractWordcounts)
In [28]:
wordcounts = []
for countSet in df['Wordcounts'].values:
    for count in countSet:
        wordcounts.append(count)
In [29]:
pd.Series(wordcounts).hist()
Out[29]:
In [30]:
plotSynchronicAnalysis(synchronicAnalysis(df))
In [122]:
synchronicAnalysis(df, useWordcounts=True).to_csv('../papers/spring2017-middlemarch-paper/data/num-words-quoted-per-chapter.csv')
In [126]:
allMatches = []
for group in df['Locations in A'].values:
    for pair in group:
        allMatches.append(pair)
In [127]:
len(allMatches)
Out[127]:
In [21]:
plotSynchronicAnalysis(synchronicAnalysis(df, useWordcounts=False), useWordcounts=False)
In [82]:
quotationsPerBook = synchronicAnalysis(df, bins=bookLocations, useWordcounts=False)
quotationsPerBook
Out[82]:
In [100]:
quotationsPerBook = pd.DataFrame(quotationsPerBook, index=range(0,9), columns=['# Quotations'])
quotationsPerBook['Book'] = range(0, 9) # 9 bins: the chunk before Book I (title page, Prelude) plus Books I-VIII.
quotationsPerBook
Out[100]:
In [102]:
alt.Chart(quotationsPerBook).mark_bar().encode(x='Book:O', y='# Quotations:Q').properties(width=500)
Out[102]:
In [114]:
# Get the raw number of quotations per chapter
# synchronicAnalysis(df, useWordcounts=False).to_csv('../papers/spring2017-middlemarch-paper/data/num-quotations-per-chapter.csv')
In [113]:
# Adjusted for the number of words in each chapter
ax = (synchronicAnalysis(df) / chapterLengthsSeries).plot(kind='bar')
ax.set_xlabel('Chapter')
ax.set_ylabel('Words Quoted, Normalized')
Out[113]:
In [107]:
plotDiachronicAnalysis(diachronicAnalysis(df, decades=(1950, 2020), bins=chapterLocations).sort_index())
In [115]:
plotDiachronicAnalysis(diachronicAnalysis(df, decades=(1960, 2020), bins=chapterLocations).sort_index())
In [148]:
diaDF = diachronicAnalysis(df, decades=(1960, 2020), bins=chapterLocations).sort_index()
In [149]:
diaDF.columns.name = 'chapter'
diaDF.index.name = 'decade'
In [156]:
diaDF
Out[156]:
In [163]:
diaDF.columns
Out[163]:
In [190]:
diaDF['decade'] = diaDF.index
In [193]:
diaMelted = diaDF.melt(id_vars='decade')
In [200]:
alt.Chart(diaMelted).mark_rect().encode(x='chapter:O', y='decade:O', color='value').properties(width=1000, height=300)
Out[200]:
In [202]:
booksDiaDF = diachronicAnalysis(df, decades=(1960, 2020), bins=bookLocations).sort_index()
booksDiaDF
Out[202]:
In [204]:
booksDiaDF['decade'] = booksDiaDF.index
In [211]:
booksMelted = booksDiaDF.melt(id_vars='decade', var_name='book')
In [212]:
booksMelted.head()
Out[212]:
In [218]:
alt.Chart(booksMelted).mark_rect().encode(x='book:O', y='decade:O', color='value').properties(width=500, height=300)
Out[218]:
In [112]:
def plotDiachronicAnalysisBooks(df, save=False, reverse=False):
    """ Same as plotDiachronicAnalysis, but labeled for book-level bins. """
    ylabels = [str(int(decade)) for decade in df.index] + ['2020']
    plt.pcolor(df, cmap='gnuplot')
    plt.yticks(np.arange(len(df.index)+1), ylabels)
    plt.gca().invert_yaxis()
    plt.ylabel('Decade')
    plt.xlabel('Book')
    plt.gca().set_xlim((0, len(df.T)))
    plt.colorbar(ticks=[])
    if save:
        plt.savefig('diachronic.png', bbox_inches='tight', dpi=300, transparent=True)
    plt.show()

plotDiachronicAnalysisBooks(diachronicAnalysis(df, decades=(1950, 2020), bins=bookLocations).sort_index())
In [44]:
# Export image for publication
# plotDiachronicAnalysis(diachronicAnalysis(df, decades=(1950, 2020), bins=chapterLocations), save=True)
In [45]:
# Get the normalized proportion of, say, Chapter 20 in 1950:
diachronicAnalysis(df)[20][1950]
Out[45]:
In [46]:
maleNames, femaleNames = names.words('male.txt'), names.words('female.txt')
maleNames = [name.lower() for name in maleNames]
femaleNames = [name.lower() for name in femaleNames]
In [47]:
def guessGender(name):
    name = name.split()[0].lower() # Grab the first name.
    if name in maleNames and name in femaleNames:
        return 'A' # Ambiguous.
    elif name in maleNames:
        return 'M'
    elif name in femaleNames:
        return 'F'
    else:
        return 'U' # Unknown.

def averageGender(authorNames):
    if not isinstance(authorNames, list):
        return 'U'
    genderGuesses = [guessGender(name) for name in authorNames]
    stats = Counter(genderGuesses).most_common()
    if len(stats) == 1:
        # Only one distinct guess. Just use it.
        return stats[0][0]
    elif stats[0][1] == stats[1][1]: # There's a tie.
        return 'A' # Ambiguous.
    else:
        return stats[0][0] # Return the most common gender.
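A few spot checks on hypothetical author strings (results depend on the NLTK names corpus):
In [ ]:
print(guessGender('John Doe'))                   # 'M', assuming 'john' appears only in male.txt
print(guessGender('Mary Smith'))                 # 'F', assuming 'mary' appears only in female.txt
print(averageGender(['Mary Smith', 'John Doe'])) # 'A': one F and one M guess tie
print(averageGender('not a list'))               # 'U': non-list author fields are unknown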
In [48]:
df['gender'] = df['author'].apply(averageGender)
dfF = df.loc[df['gender'] == 'F']
dfM = df.loc[df['gender'] == 'M']
In [49]:
# Differences in citations between genders.
plotSynchronicAnalysis(synchronicAnalysis(dfM) - synchronicAnalysis(dfF))
In [35]:
def getFirst(row):
    if isinstance(row, list):
        return row[0]
    else:
        return row

topPublishers = df['publisher_name'].apply(getFirst).value_counts()
In [36]:
publishers = topPublishers[:80].index
In [37]:
publishers = publishers.tolist()
In [38]:
def getCountry(publisher):
    brits = ['Oxford University Press', 'Cambridge University Press', 'Modern Humanities Research Association',
             'BMJ', 'Taylor & Francis, Ltd.', 'Edinburgh University Press',
             'Royal Society for the Encouragement of Arts, Manufactures and Commerce']
    canadians = ['Victorian Studies Association of Western Canada']
    if not isinstance(publisher, list):
        return 'Unknown'
    publisher = publisher[0]
    if publisher in brits:
        return 'Britain'
    elif publisher in canadians or 'Canada' in publisher:
        return 'Canada'
    elif 'GmbH' in publisher:
        return 'Germany'
    elif 'estudios' in publisher:
        return 'Spain'
    elif 'France' in publisher:
        return 'France'
    elif 'Ireland' in publisher:
        return 'Ireland'
    else:
        return 'US' # Everything unrecognized is assumed to be American.
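Since everything unrecognized falls through to 'US', a couple of spot checks on illustrative publisher lists:
In [ ]:
print(getCountry(['Oxford University Press'])) # Britain
print(getCountry(['An Unlisted Press']))       # US -- the fallback
print(getCountry(None))                        # Unknown -- not a list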
In [39]:
df['country'] = df['publisher_name'].apply(getCountry)
In [40]:
df['country'].value_counts()
Out[40]:
In [41]:
dfBrits = df.loc[df['country'] == 'Britain']
dfYanks = df.loc[df['country'] == 'US']
dfCanadians = df.loc[df['country'] == 'Canada']
In [42]:
# Since British publishers are greatly outnumbered in this corpus, normalize each series by its own maximum.
britsHist = synchronicAnalysis(dfBrits)
normBrits = britsHist.div(britsHist.max())
yanksHist = synchronicAnalysis(dfYanks)
normYanks = yanksHist.div(yanksHist.max())
In [43]:
plotSynchronicAnalysis(normYanks - normBrits)
In [44]:
# Look at the top journals.
journalStats = df['journal'].value_counts()
journalStats[:10]
Out[44]:
In [45]:
journalList = journalStats.index
Compare the specialist journal, "George Eliot - George Henry Lewes Studies," with all other journals.
In [46]:
geJournals = df.loc[df['journal'] == 'George Eliot - George Henry Lewes Studies']
otherJournals = df.loc[df['journal'] != 'George Eliot - George Henry Lewes Studies']
In [47]:
# Normalize
geDF = synchronicAnalysis(geJournals)
otherDF = synchronicAnalysis(otherJournals)
normGE = geDF.div(geDF.max())
normOther = otherDF.div(otherDF.max())
In [64]:
ax = (normGE - normOther).plot(kind='bar')
ax.set_xlabel('Chapter')
ax.set_ylabel('Specialization Index')
# Save a big version for publication.
ax.get_figure().savefig('specialization.png', bbox_inches='tight', dpi=300)
In [56]:
journals = pd.DataFrame({title: synchronicAnalysis(df.loc[df['journal'] == title]) for title in journalList }).T
In [57]:
cutoff = 1500
topJournals = journals.loc[journals.sum(axis=1) > cutoff]
otherJournals = journals.loc[journals.sum(axis=1) <= cutoff] # <= so journals exactly at the cutoff aren't dropped.
topJournals.loc['Other'] = otherJournals.sum()
In [58]:
topJournals.T.plot(kind='bar', stacked=True, colormap='nipy_spectral')
Out[58]:
In [63]:
ax = topJournals.T.plot(kind='bar', stacked=True, colormap='nipy_spectral')
fig = ax.get_figure()
fig.savefig('synchronic-journals.png', bbox_inches='tight', dpi=300)
In [49]:
# Try to find out why Ch. 15 is cited so heavily in the 80s and 90s.
chap15s = []
ids = []
for i, row in df.iterrows():
    locations = row['Locations in A']
    starts = [item[0] for item in locations]
    if row['Decade'] in [1980, 1990]:
        for start in starts:
            if 290371 < start < 322051: # Chapter XV's character offsets. Does it cite Chapter XV?
                if row.id not in ids:
                    chap15s.append(row)
                    ids.append(row.id)
In [50]:
# Get the titles of those articles.
[item.title for item in chap15s]
Out[50]:
In [51]:
ch15Topics = [item.topics for item in chap15s]
chap15TopicsFlat = [item for sublist in ch15Topics for item in sublist]
Counter(chap15TopicsFlat).most_common(20)
Out[51]:
In [ ]:
Chapter 20 Detour
In [26]:
# Try to find out what articles cited chapter 20
chap20s = []
ids = []
for i, row in df.iterrows():
    locations = row['Locations in A']
    starts = [item[0] for item in locations]
    if row['Decade'] in [1870, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]:
        for start in starts:
            if 406324 < start < 432778: # Chapter XX's character offsets. Does it cite Chapter XX?
                if row.id not in ids:
                    chap20s.append(row)
                    ids.append(row.id)
In [27]:
# Get the titles of those articles.
[item.title for item in chap20s]
Out[27]:
In [28]:
len(chap20s)
Out[28]:
In [29]:
# Try to find out what articles cite paragraph 6 in Chapter 20
chap20par6s = []
ids = []
for i, row in df.iterrows():
    locations = row['Locations in A']
    starts = [item[0] for item in locations]
    if row['Decade'] in [1870, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]:
        for start in starts:
            if 411152 < start < 412177: # Does it cite paragraph 6 of Chapter XX?
                if row.id not in ids:
                    chap20par6s.append(row)
                    ids.append(row.id)
In [30]:
# Get the titles of those articles.
[item.title for item in chap20par6s]
Out[30]:
In [32]:
len(chap20par6s) # The number of items citing paragraph 6 in chapter 20
Out[32]:
In [31]:
xxStart, xxEnd = chapterLocations[20:22] # Chapter 20 Boundaries
In [32]:
print(mm[xxStart:xxStart+1000]) # Verify we have Ch. 20
In [34]:
xx = mm[xxStart:xxEnd]
In [39]:
xxParaLocations = [match.start() for match in re.finditer('\n\n+', mm)]
xxParaLocations = [x for x in xxParaLocations if (x > xxStart) and (x < xxEnd)]
In [45]:
mm[xxParaLocations[4]:xxParaLocations[5]]
Out[45]:
In [47]:
articlesWithMatches['Locations in A'].loc[0]
Out[47]:
In [48]:
def inXX(matches):
    """ Determine whether the article has a match in Ch. 20. """
    for match in matches:
        if xxStart < match[0] < xxEnd:
            return True
    return False
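inXX is hard-wired to Chapter 20's bounds; a more general helper (a hypothetical sketch, not used below) could take any index into chapterLocations:
In [ ]:
def inChapter(matches, n, bounds=chapterLocations):
    """ True if any match starts inside chunk n of the chapter segmentation. """
    start, end = bounds[n], bounds[n + 1]
    return any(start < match[0] < end for match in matches)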
In [50]:
articlesWithMatches['Locations in A'].apply(inXX).head()
Out[50]:
In [66]:
def paraIndicesIn20(matches, paraLocations=xxParaLocations):
    """ Determine paragraph numbers (indices) for matches in Ch. 20. """
    paraIndices = []
    if inXX(matches):
        paraBoundaries = list(zip(paraLocations, paraLocations[1:]))
        for match in matches:
            for i, paraBoundary in enumerate(paraBoundaries):
                # A match falls in a paragraph when their character ranges intersect.
                if set(range(match[0], match[1])) & set(range(paraBoundary[0], paraBoundary[1])):
                    paraIndices.append(i)
    else:
        paraIndices.append(None) # Placeholder for articles with no Ch. 20 match.
    return paraIndices
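The set-intersection test builds full integer sets just to detect overlap; an equivalent endpoint comparison (a sketch) does the same in constant time:
In [ ]:
def rangesOverlap(a, b):
    """ Equivalent to bool(set(range(*a)) & set(range(*b))), without materializing sets. """
    return max(a[0], b[0]) < min(a[1], b[1])
print(rangesOverlap((8, 10), (1, 9))) # True, matching the set-based check below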
In [67]:
len(set(range(8, 10)) & set(range(1, 9))) # Sanity check: overlapping ranges share at least one position.
Out[67]:
In [70]:
articlesWithMatches = articlesWithMatches.copy() # Avoid SettingWithCopyWarning on the slice of df.
articlesWithMatches['paraIndicesIn20'] = articlesWithMatches['Locations in A'].apply(paraIndicesIn20)
In [85]:
counters = list(articlesWithMatches['paraIndicesIn20'].apply(Counter))
In [88]:
grandTally = Counter()
In [89]:
for counter in counters:
    grandTally += counter
In [93]:
del grandTally[None] # Drop the placeholder for articles with no Ch. 20 match.
In [100]:
dict(grandTally)
Out[100]:
In [103]:
pd.Series(dict(grandTally)).sort_index().plot(kind='bar')
Out[103]:
In [107]:
print(mm[xxParaLocations[5]:xxParaLocations[7]]) # What are paragraphs #5 and #6?
In [ ]: