In [2]:
    
import spacy
import pandas as pd
%matplotlib inline
from ast import literal_eval
import numpy as np
import re
import json
from nltk.corpus import names
from collections import Counter
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
plt.style.use('ggplot')
    
In [3]:
    
nlp = spacy.load('en_core_web_sm')  # recent spaCy releases need a full model name; the bare 'en' shortcut is deprecated
    
In [4]:
    
with open('../middlemarch.txt') as f: 
    mm = f.read()
    
In [5]:
    
textALength = len(mm)
    
In [6]:
    
# Get chapter locations
chapterMatches = re.finditer('PRELUDE|CHAPTER|FINALE', mm)
chapterLocations = [match.start() for match in chapterMatches]
chapterLocations.append(textALength) # Append the end of the text so the final section is captured as a chunk.
len(chapterLocations)
    
    Out[6]:
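A quick sanity check (added here for illustration, not part of the original analysis): peek at the text just after the first few boundaries to confirm the regex is catching real section headings rather than incidental occurrences of these words.
In [ ]:
    
for loc in chapterLocations[:3]:
    print(repr(mm[loc:loc+40]))
    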
In [7]:
    
def getChapters(text): 
    """ Splits the text into chunks at the section boundaries found above. """
    chapters = []
    for i, loc in enumerate(chapterLocations): 
        if i != len(chapterLocations)-1: 
            # Slice the function's argument, not the global `mm`.
            chapter = text[loc:chapterLocations[i+1]]
            chapters.append(chapter)
    return chapters
    
In [8]:
    
chapters = getChapters(mm)
chapterLengths = [len(nlp.tokenizer(chapter)) for chapter in chapters] # tokenize only; tagging and parsing aren't needed for word counts
chapterLengthsSeries = pd.Series(chapterLengths)
chapterLengthsSeries.plot(kind='bar', title='Chapter Lengths')
    
    Out[8]:
    
In [9]:
    
with open('../txt/e4.json') as f: 
    rawData = f.read()
    
In [10]:
    
df = pd.read_json(rawData)
    
In [19]:
    
df
    
    Out[19]:
In [150]:
    
df.publication_qualifier.value_counts()
    
    Out[150]:
In [11]:
    
df['Decade'] = df['year'] - (df['year'] % 10)
# df['Locations in A'] = df['matches'].apply(lambda x: x[1])
# df['NumMatches'] = df['matches'].apply(lambda x: x[0])
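    
The subtraction above floors each year to the start of its decade; a quick illustrative check:
In [ ]:
    
# e.g. 1987 % 10 == 7, so 1987 - 7 == 1980
1987 - (1987 % 10)
    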
    
In [12]:
    
sum([len(item) for item in df['Locations in A'].values])
    
    Out[12]:
In [23]:
    
def diachronicAnalysis(df, decades=(1950, 2020), bins=chapterLocations): 
    decades = np.arange(decades[0], decades[1], 10)
    # Make a dictionary of decades. 
    # Values are a list of locations.  
    decadeDict = {}
    for i, row in df.iterrows():
        decade = row['Decade']
        locationsAndWordcounts = row['Locations in A with Wordcounts']
        if decade not in decadeDict: 
            decadeDict[decade] = locationsAndWordcounts
        else: 
            decadeDict[decade] += locationsAndWordcounts
    # Grab the beginnings of quotes, keeping each quote's word count as its weight. 
    decadeStartsWeights = {decade: [(item[0][0], item[1]) for item in loc]
                           for decade, loc in decadeDict.items()}
    # Bin quote start positions by chapter boundary, weighted by word count.
    # (`range` is ignored when explicit bin edges are passed, but is harmless.)
    decadesBinned = {decade: np.histogram([loc[0] for loc in locations],
                                          bins=bins,
                                          weights=[loc[1] for loc in locations],
                                          range=(0, textALength))[0]
                     for decade, locations in decadeStartsWeights.items()
                     if decade in decades}
    decadesDF = pd.DataFrame(decadesBinned).T
    # Normalize each decade's row by its maximum, so chapters are comparable within a decade.
    decadesDF = decadesDF.div(decadesDF.max(axis=1), axis=0)
    return decadesDF
def countWords(locRange): 
    """ Counts words in Middlemarch, given a character range. """
    doc = nlp.tokenizer(mm[locRange[0]:locRange[1]]) # tokenize only
    return len(doc)
def totalWords(locRangeSet): 
    """ Counts total words in a list of location ranges. """
    return sum([countWords(locRange) for locRange in locRangeSet])    
    
def countsPerSet(locRangeSet): 
    """ Returns an augmented location range set that includes word counts. """
    return [(locRange, countWords(locRange))
             for locRange in locRangeSet]
    
def synchronicAnalysis(df, bins=chapterLocations): 
    locs = df['Locations in A'].values
    locCounts = [(loc, countWords(loc)) for locSet in locs
              for loc in locSet]
    starts = [loc[0][0] for loc in locCounts]
    counts = [loc[1] for loc in locCounts]
    binned = np.histogram(starts, bins=bins, 
                          weights=counts, range=(0, textALength))
    binnedDF = pd.Series(binned[0])
    return binnedDF
def plotDiachronicAnalysis(df): 
    ylabels = [str(int(decade)) for decade in df.index]
    plt.pcolor(df, cmap='gnuplot')
    plt.yticks(np.arange(len(df.index)) + 0.5, ylabels) # center each decade label on its row
    plt.gca().invert_yaxis()
    plt.ylabel('Decade')
    plt.xlabel('Chapter')
#     plt.title("Frequency of Quotations from George Eliot's Middlemarch in Criticism, By Decade")
    plt.gca().set_xlim((0, len(df.T)))
    plt.colorbar(ticks=[])
    plt.show()
    
def plotSynchronicAnalysis(s): 
    ax = s.plot(kind='bar')
    ax.set_xlabel('Chapter')
    ax.set_ylabel('Number of Words Quoted')
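    
Both analyses hinge on the same trick: np.histogram with the chapter boundaries as explicit bin edges, weighting each quote's start position by its word count. A toy example (illustrative only, with made-up edges and weights) shows how the weights are summed per bin:
In [ ]:
    
# Two toy "chapters" spanning [0, 100) and [100, 200], and three "quotes"
# starting at 10, 150, and 160 with word counts 5, 7, and 3.
# The weights landing in each bin are summed, giving array([ 5, 10]).
np.histogram([10, 150, 160], bins=[0, 100, 200], weights=[5, 7, 3])[0]
    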
    
In [15]:
    
df['Quoted Words'] = df['Locations in A'].apply(totalWords)
    
In [16]:
    
df['Locations in A with Wordcounts'] = df['Locations in A'].apply(countsPerSet)
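    
Each entry in the new column pairs a quote's character range with its word count, i.e. ((start, end), n_words). An illustrative peek at the first article's matches:
In [ ]:
    
df['Locations in A with Wordcounts'].iloc[0][:3]
    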
    
In [17]:
    
plotSynchronicAnalysis(synchronicAnalysis(df))
    
    
In [18]:
    
# Adjusted for the number of words in each chapter
ax = (synchronicAnalysis(df) / chapterLengthsSeries).plot(kind='bar')
ax.set_xlabel('Chapter')
ax.set_ylabel('Words Quoted, Normalized')
    
    Out[18]:
    
In [25]:
    
plotDiachronicAnalysis(diachronicAnalysis(df, decades=(1870, 1910), bins=chapterLocations))
    
    
In [75]:
    
# Look at the top journals. 
journals = df['publication_title'].value_counts()
journals.plot(kind='bar')
    
    Out[75]:
    
Next, compare how each journal's quotations distribute across the chapters.
In [76]:
    
journalDict = {title: synchronicAnalysis(df.loc[df['publication_title'] == title]) for title in journals.index}
    
In [104]:
    
byJournal = pd.DataFrame(journalDict).T
# byJournal
    
In [105]:
    
byJournal
    
    Out[105]:
In [133]:
    
cutoff = 500 # Minimum total quoted words for a journal to appear individually
topJournals = byJournal.loc[byJournal.sum(axis=1) >= cutoff]
otherJournals = byJournal.loc[byJournal.sum(axis=1) < cutoff] # strict <, so no journal falls in both groups
topJournals.loc['Other'] = otherJournals.sum()
    
    
In [134]:
    
topJournals.sum(axis=1)
    
    Out[134]:
In [146]:
    
ax = topJournals.T.plot(kind='bar', stacked=True, colormap='nipy_spectral')
ax.set_xlabel('Chapter') # call the setters; assigning strings to them silently clobbers the methods
ax.set_ylabel('Number of Words Quoted')
    
    
In [ ]: