In [2]:
import spacy
import pandas as pd
%matplotlib inline
from ast import literal_eval
import numpy as np
import re
import json
from nltk.corpus import names
from collections import Counter
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
plt.style.use('ggplot')
In [3]:
nlp = spacy.load('en')  # 'en' shortcut works in spaCy 1.x/2.x; spaCy 3+ uses 'en_core_web_sm'
In [4]:
with open('../middlemarch.txt') as f:
    mm = f.read()
In [5]:
textALength = len(mm)
In [6]:
# Get chapter locations
chapterMatches = re.finditer('PRELUDE|CHAPTER|FINALE', mm)
chapterLocations = [match.start() for match in chapterMatches]
chapterLocations.append(textALength) # Append the end of the text so the final chapter forms the last bin.
len(chapterLocations)
Out[6]:
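As a quick sanity check (a minimal sketch, assuming `mm` and `chapterLocations` as defined above), we can peek at the text at the first few boundaries to confirm the regex matched real headings rather than stray occurrences of these words:
In [ ]:
# Preview the text at the first few boundaries found by the regex.
for loc in chapterLocations[:3]:
    print(repr(mm[loc:loc + 40]))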
In [7]:
def getChapters(text):
    chapters = []
    for i, loc in enumerate(chapterLocations):
        if i != len(chapterLocations)-1:
            chapter = text[loc:chapterLocations[i+1]]
            chapters.append(chapter)
    return chapters
In [8]:
chapters = getChapters(mm)
# Tokenize each chapter; the tag/parse/entity flags (spaCy 1.x API) skip everything but tokenization for speed.
chapterLengths = [len(nlp(chapter, tag=False, parse=False, entity=False)) for chapter in chapters]
chapterLengthsSeries = pd.Series(chapterLengths)
chapterLengthsSeries.plot(kind='bar', title='Chapter Lengths')
Out[8]:
In [9]:
with open('../txt/e4.json') as f:
    rawData = f.read()
In [10]:
df = pd.read_json(rawData)
In [19]:
df
Out[19]:
In [150]:
df.publication_qualifier.value_counts()
Out[150]:
In [11]:
df['Decade'] = df['year'] - (df['year'] % 10)
# df['Locations in A'] = df['matches'].apply(lambda x: x[1])
# df['NumMatches'] = df['matches'].apply(lambda x: x[0])
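The modulo arithmetic above floors each year to its decade; a toy example (hypothetical years, not from the dataset):
In [ ]:
# 1973 - (1973 % 10) == 1970, and so on.
years = pd.Series([1954, 1973, 1999, 2001])
years - (years % 10)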
In [12]:
sum([len(item) for item in df['Locations in A'].values])
Out[12]:
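Judging from how the functions below index into it, each entry of 'Locations in A' appears to be a list of (start, end) character offsets into Middlemarch. A minimal inspection sketch (assuming at least one article has matches):
In [ ]:
# Peek at the first non-empty set of location ranges.
next(locSet for locSet in df['Locations in A'] if len(locSet) > 0)[:3]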
In [23]:
def diachronicAnalysis(df, decades=(1950, 2020), bins=chapterLocations):
    decades = np.arange(decades[0], decades[1], 10)
    # Make a dictionary of decades.
    # Values are lists of (location range, word count) pairs.
    decadeDict = {}
    for i, row in df.iterrows():
        decade = row['Decade']
        locationsAndWordcounts = row['Locations in A with Wordcounts']
        if decade not in decadeDict:
            decadeDict[decade] = locationsAndWordcounts
        else:
            decadeDict[decade] += locationsAndWordcounts
    # Grab the beginnings of quotes, keeping their word counts as weights.
    decadeStartsWeights = {decade: [(item[0][0], item[1])
                                    for item in loc]
                           for decade, loc in decadeDict.items()}
    decadesBinned = {decade:
                     np.histogram([loc[0] for loc in locations],
                                  bins=bins,
                                  weights=[loc[1] for loc in locations],
                                  range=(0, textALength))[0]
                     for decade, locations in decadeStartsWeights.items()
                     if decade in decades}
    decadesDF = pd.DataFrame(decadesBinned).T
    # Normalize each decade's row by its maximum bin.
    decadesDF = decadesDF.div(decadesDF.max(axis=1), axis=0)
    return decadesDF

def countWords(locRange):
    """ Counts words in Middlemarch, given a character range. """
    doc = nlp(mm[locRange[0]:locRange[1]], tag=False, parse=False, entity=False)
    return len(doc)

def totalWords(locRangeSet):
    """ Counts total words in a list of location ranges. """
    return sum([countWords(locRange) for locRange in locRangeSet])

def countsPerSet(locRangeSet):
    """ Returns an augmented location range set that includes word counts. """
    return [(locRange, countWords(locRange))
            for locRange in locRangeSet]

def synchronicAnalysis(df, bins=chapterLocations):
    locs = df['Locations in A'].values
    locCounts = [(loc, countWords(loc)) for locSet in locs
                 for loc in locSet]
    starts = [loc[0][0] for loc in locCounts]
    counts = [loc[1] for loc in locCounts]
    binned = np.histogram(starts, bins=bins,
                          weights=counts, range=(0, textALength))
    binnedDF = pd.Series(binned[0])
    return binnedDF

def plotDiachronicAnalysis(df):
    ylabels = [str(int(decade)) for decade in df.index] #+ ['2020']
    plt.pcolor(df, cmap='gnuplot')
    plt.yticks(np.arange(len(df.index)+1), ylabels)
    plt.gca().invert_yaxis()
    plt.ylabel('Decade')
    plt.xlabel('Chapter')
    # plt.title("Frequency of Quotations from George Eliot's Middlemarch in Criticism, By Decade")
    plt.gca().set_xlim((0, len(df.T)))
    plt.colorbar(ticks=[])
    plt.show()

def plotSynchronicAnalysis(s):
    ax = s.plot(kind='bar')
    ax.set_xlabel('Chapter')
    ax.set_ylabel('Number of Words Quoted')
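Both analyses reduce to the same weighted-histogram trick: quote start offsets are binned by chapter boundary, with each quote weighted by its word count. A toy illustration with made-up offsets and counts:
In [ ]:
# Three hypothetical quotes starting at offsets 5, 12, and 30, containing
# 100, 40, and 7 words, binned by boundaries at 0, 10, 25, and 50.
starts = [5, 12, 30]
weights = [100, 40, 7]
np.histogram(starts, bins=[0, 10, 25, 50], weights=weights)[0]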
In [15]:
df['Quoted Words'] = df['Locations in A'].apply(totalWords)
In [16]:
df['Locations in A with Wordcounts'] = df['Locations in A'].apply(countsPerSet)
In [17]:
plotSynchronicAnalysis(synchronicAnalysis(df))
In [18]:
# Adjusted for the number of words in each chapter
ax = (synchronicAnalysis(df) / chapterLengthsSeries).plot(kind='bar')
ax.set_xlabel('Chapter')
ax.set_ylabel('Words Quoted, Normalized')
Out[18]:
In [25]:
plotDiachronicAnalysis(diachronicAnalysis(df, decades=(1870, 1910), bins=chapterLocations))
In [75]:
# Look at the top journals.
journals = df['publication_title'].value_counts()
journals.plot(kind='bar')
Out[75]:
Compare how individual journals distribute their quotations across the novel's chapters.
In [76]:
journalDict = {title: synchronicAnalysis(df.loc[df['publication_title'] == title]) for title in journals.index}
In [104]:
byJournal = pd.DataFrame(journalDict).T
# byJournal
In [105]:
byJournal
Out[105]:
In [133]:
cutoff = 500 # Minimum number of quoted words for a journal to be shown on its own
topJournals = byJournal.loc[byJournal.sum(axis=1) >= cutoff]
otherJournals = byJournal.loc[byJournal.sum(axis=1) < cutoff]
topJournals.loc['Other'] = otherJournals.sum()
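The same threshold-and-bucket pattern in miniature (hypothetical totals, not the real journal counts):
In [ ]:
toy = pd.DataFrame({'words': [900, 40, 620, 15]},
                   index=['Journal A', 'Journal B', 'Journal C', 'Journal D'])
top = toy.loc[toy['words'] >= 500].copy()
top.loc['Other'] = toy.loc[toy['words'] < 500].sum()
top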
In [134]:
topJournals.sum(axis=1)
Out[134]:
In [146]:
ax = topJournals.T.plot(kind='bar', stacked=True, colormap='nipy_spectral')
ax.set_xlabel('Chapter')
ax.set_ylabel('Number of Words Quoted')
In [ ]: