Experiment 2A

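This notebook uses the rewritten text matcher (`Text` and `Matcher` from the `matcher` module) to match every article record in `../txt/middlemarch.json` against the text of Middlemarch, and writes the annotated records to `../txt/e2a.json`.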


In [1]:
from matcher import Text, Matcher
import json
import pandas as pd
from IPython.display import clear_output
%matplotlib inline
import matplotlib.pyplot as plt
# Default size for every matplotlib figure created after this point:
# 16 wide by 6 tall (matplotlib interprets figure.figsize as inches).
plt.rcParams["figure.figsize"] = [16, 6]

In [5]:
# Load the data.
# The file is treated as JSON Lines: each line is parsed below as its own
# JSON document.
# NOTE(review): no encoding is passed to open(), so the platform default
# applies to both files read in this cell -- confirm that is what the data
# was written with, or pass encoding= explicitly.
with open('../txt/middlemarch.json') as f:
    rawCriticism = f.readlines()

# Parse the data.
# Blank or whitespace-only lines (e.g. stray newlines at the end of the file)
# are skipped; json.loads would otherwise raise JSONDecodeError on them.
data = [json.loads(line) for line in rawCriticism if line.strip()]

# Load Middlemarch
with open('../middlemarch.txt') as f:
    rawMM = f.read()

# Wrap the full novel once; the same object is reused for every comparison.
mm = Text(rawMM, 'Middlemarch')

In [6]:
# Match every article against Middlemarch and store the results on the
# article record itself. Records that already have a 'numMatches' key are
# left untouched, so re-running this cell does not redo finished articles.
for i, article in enumerate(data):
    # wait=True delays the clear until the next output is ready, so the
    # output area is never blank between two progress updates (no flicker).
    clear_output(wait=True)
    # i is 0-based; report i + 1 so the counter runs from 1 to len(data)
    # and the final message reads "N of N" instead of "N-1 of N".
    print('\r', 'Matching article %s of %s' % (i + 1, len(data)), end='')
    if 'numMatches' not in article:
        articleText = Text(article['ocr'], article['id'])
        # match() is unpacked into exactly three values. Presumably "A" is the
        # first Matcher argument (Middlemarch) and "B" the second (the
        # article) -- TODO confirm against the matcher module.
        article['numMatches'], article['Locations in A'], article['Locations in B'] = \
            Matcher(mm, articleText).match()


 Matching article 6068 of 6069

In [7]:
# Persist the annotated article records.
# The whole list is serialised in one json.dump call, so the output file
# holds a single JSON array (unlike the input, which was read line by line).
resultsPath = '../txt/e2a.json'
with open(resultsPath, 'w') as resultsFile:
    json.dump(data, resultsFile)