In [12]:
    
# The matcher module is provided by the text_matcher package. 
# This one uses the latest version from the submodule, 
# installed with `pip install --editable .`. 
from text_matcher.matcher import Text, Matcher
import json
from glob import glob
import pandas as pd
from IPython.display import clear_output
%matplotlib inline
import matplotlib.pyplot as plt
import spacy
# Default every figure in this notebook to a wide 16x6 canvas.
plt.rcParams.update({"figure.figsize": [16, 6]})
    
In [13]:
    
# Load the spaCy pipeline registered under the shortcut name 'en'; the resulting
# `nlp` object is handed to every Text(...) constructed below.
# NOTE(review): bare shortcut names like 'en' are presumably only resolvable on
# older spaCy releases (newer ones expect a full package name such as
# 'en_core_web_sm') — confirm against the installed spaCy version.
nlp = spacy.load('en')
    
In [14]:
    
# Load the data: read every JSON file under ../txt/bpo/ into memory.
# The path list gets its own name instead of being assigned to `glob`: rebinding
# `glob` shadows the imported glob() function, so running this cell a second time
# failed with "TypeError: 'list' object is not callable".
articlePaths = glob('../txt/bpo/*.json')
jsonContents = []
for path in articlePaths:
    # The context manager closes each handle as soon as the file has been read,
    # rather than leaving one open handle per article for the garbage collector.
    with open(path) as f:
        jsonContents.append(f.read())
# Parse the data: each file's full contents is parsed as a single JSON document.
data = [json.loads(contents) for contents in jsonContents]
# Load Middlemarch
with open('../middlemarch.txt') as f:
    rawMM = f.read()
# Wrap the raw novel text in a text_matcher Text labelled 'Middlemarch', passing `nlp`.
mm = Text(rawMM, 'Middlemarch', nlp)
    
In [ ]:
    
# Match every article against Middlemarch and store the three values returned by
# Matcher.match() on the article dict itself.
# Articles that already carry a 'numMatches' key are skipped, so re-running this
# cell does not redo work that has already been recorded in `data`.
numArticles = len(data)
# Count from 1 so the progress line runs "1 of N" ... "N of N"; counting from 0
# reported "0 of N" first and never reached N.
for i, article in enumerate(data, start=1):
    # '\r' returns to the start of the line so the counter overwrites itself
    # instead of printing one line per article.
    print('\r', 'Matching article %s of %s' % (i, numArticles), end='')
    if 'numMatches' in article:
        continue
    articleText = Text(article['text'], article['record_title'], nlp)
    (article['numMatches'],
     article['Locations in A'],
     article['Locations in B']) = Matcher(mm, articleText).match()
    
In [18]:
    
# Serialize the full `data` list to ../txt/e4.json as a single JSON document.
with open('../txt/e4.json', 'w') as jsonOut:
    json.dump(data, jsonOut)