In [12]:
# The matcher module is provided by the text_matcher package.
# This one uses the latest version from the submodule,
# installed with `pip install --editable .`.
from text_matcher.matcher import Text, Matcher
import json
from glob import glob
import pandas as pd
from IPython.display import clear_output
%matplotlib inline
import matplotlib.pyplot as plt
import spacy
# Default size for every figure drawn in this notebook, as [width, height]
# (matplotlib measures figsize in inches).
plt.rcParams["figure.figsize"] = [16, 6]
In [13]:
# Load the spaCy English pipeline; it is handed to every Text constructed below.
# NOTE(review): the 'en' shortcut name only resolves on spaCy 2.x. spaCy 3
# removed shortcut links, so a full package name such as 'en_core_web_sm'
# is required there — confirm which spaCy version this notebook targets.
nlp = spacy.load('en')
In [14]:
# Load the data: read every article file matching ../txt/bpo/*.json.
# The path list gets its own name instead of rebinding `glob`; rebinding
# shadowed the imported function, so re-running this cell failed with
# "TypeError: 'list' object is not callable".
articlePaths = glob('../txt/bpo/*.json')
jsonContents = []
for articlePath in articlePaths:
    # A context manager closes each file as soon as it has been read,
    # rather than leaving the handles open until garbage collection.
    with open(articlePath) as articleFile:
        jsonContents.append(articleFile.read())
# Parse the data: the full contents of each file is decoded as one JSON document.
data = [json.loads(contents) for contents in jsonContents]
# Load Middlemarch and wrap the raw text in a Text object for matching.
with open('../middlemarch.txt') as f:
    rawMM = f.read()
mm = Text(rawMM, 'Middlemarch', nlp)
In [ ]:
# Match every article against Middlemarch and store the results on its record.
totalArticles = len(data)
for articleNumber, article in enumerate(data, start=1):
    # '\r' moves back to the start of the line so the counter overwrites itself.
    # Numbering starts at 1 so the display runs "1 of N" through "N of N"
    # (the 0-based index showed "0 of N" and never reached N).
    print('\r', 'Matching article %s of %s' % (articleNumber, totalArticles), end='')
    # Records that already have a 'numMatches' key are left untouched.
    if 'numMatches' in article:
        continue
    articleText = Text(article['text'], article['record_title'], nlp)
    # Matcher.match() yields three values, stored under these three keys.
    (article['numMatches'],
     article['Locations in A'],
     article['Locations in B']) = Matcher(mm, articleText).match()
In [18]:
# Persist the article records (including any match results stored on them)
# as a single JSON document.
with open('../txt/e4.json', mode='w') as resultsFile:
    json.dump(data, resultsFile)