Experiment 4

An analysis of book reviews containing the word "Middlemarch." Uses material from the British Periodicals Online corpus. Thanks to the Stanford Literary Lab for help with this.


In [12]:
# The matcher module is provided by the text_matcher package. 
# This one uses the latest version from the submodule, 
# installed with `pip install --editable .`. 
from text_matcher.matcher import Text, Matcher
import json
from glob import glob
import pandas as pd
from IPython.display import clear_output
%matplotlib inline
import matplotlib.pyplot as plt
import spacy
plt.rcParams["figure.figsize"] = [16, 6]

In [13]:
# Load the spaCy English pipeline once; the same object is handed to every
# Text(...) constructed in the cells below.
# NOTE(review): 'en' is a shortcut-link name. spaCy v3 removed shortcut links,
# so a newer install would presumably need a full package name such as
# 'en_core_web_sm' here — confirm against the installed spaCy version.
nlp = spacy.load('en')

In [14]:
# Load the data.

# Keep the file list under its own name. Rebinding `glob` to the result would
# shadow the imported function, so re-running this cell would fail with
# "'list' object is not callable". Sorting gives a stable article order,
# since glob() itself returns paths in arbitrary order.
jsonPaths = sorted(glob('../txt/bpo/*.json'))

# Read each file inside a `with` block so its handle is closed right away
# instead of being left open until garbage collection.
jsonContents = []
for path in jsonPaths:
    with open(path) as f:
        jsonContents.append(f.read())

# Parse the data. Each file's full contents are parsed as one JSON document.

data = [json.loads(contents) for contents in jsonContents]

# Load Middlemarch
with open('../middlemarch.txt') as f:
    rawMM = f.read()

# Wrap the novel's text in a Text object, labelled 'Middlemarch', for matching.
mm = Text(rawMM, 'Middlemarch', nlp)

In [ ]:
# Match every article against Middlemarch and store the three values returned
# by Matcher.match() on the article dict itself.
for i, article in enumerate(data, start=1):
    # Count from 1 so the progress line reads "1 of N" ... "N of N" rather
    # than starting at 0 and never reaching N. The leading '\r' rewrites the
    # same output line on each iteration.
    print('\r', 'Matching article %s of %s' % (i, len(data)), end='')
    # Articles that already carry a 'numMatches' key are skipped, so
    # re-running this cell does not redo work that has already been done.
    if 'numMatches' in article:
        continue
    articleText = Text(article['text'], article['record_title'], nlp)
    article['numMatches'], article['Locations in A'], article['Locations in B'] = \
        Matcher(mm, articleText).match()

In [18]:
# Serialize the full `data` list to a single JSON file on disk.
outputPath = '../txt/e4.json'
with open(outputPath, mode='w') as outfile:
    json.dump(data, outfile)