In [ ]:
# coding: utf-8
# python2.7
from __future__ import division, print_function
from parsers import CitationWindowParser
# import subprocess
def create_context_parsers(window_specs):
    """
    Build one CitationWindowParser for every (window type, window size) pair.

    Args:
        window_specs (dict): maps a window type to an iterable of window
            sizes. Element 0 of each size is passed to the parser as the
            "before" value and element 1 as the "after" value.
    Returns:
        parsers (list): one parser per size, following the iteration order
            of window_specs and of each size collection.
    """
    return [
        CitationWindowParser(kind, size[0], size[1])
        for kind, sizes in window_specs.items()
        for size in sizes
    ]
def prepare_context_texts(filepath):
    """
    Read a citation-context XML file and collect the body lines of every
    context found in it.

    The file is cut into raw contexts by split_contexts(); each raw context
    is passed through delimit_text(), and contexts for which it returns
    None are dropped.

    Args:
        filepath (str): path of the context file
    Returns:
        texts (list of list of str): one list of lines per kept context
    """
    extracted = (delimit_text(raw) for raw in split_contexts(filepath))
    return [lines for lines in extracted if lines is not None]
def split_contexts(filepath):
    """
    Load the context file and cut its content at every '<CITEDINTEXTID>'
    marker.

    A progress message naming the file is printed before it is opened.

    Args:
        filepath (str): path of the context file
    Returns:
        contexts (list of str): the pieces between markers; the first
            element is whatever precedes the first marker (possibly an
            empty string).
    """
    print("...opening context file: {}".format(filepath))
    with open(filepath) as handle:
        whole_document = handle.read()
    return whole_document.split('<CITEDINTEXTID>')
def delimit_text(context):
    """
    Extract the lines found between the '<CONTEXT>' and '</CONTEXT>'
    marker lines of a single context.

    The context is split on newlines; a marker is only recognised when it
    makes up an entire line.

    Args:
        context (str): raw text of one context
    Returns:
        lines (list of str): the lines strictly between the opening marker
            and the first closing marker that follows it, or None when
            either marker is missing.
    """
    lines = context.split('\n')
    try:
        start_index = lines.index('<CONTEXT>') + 1
        # Look for the closing marker only after the opening one, so a stray
        # earlier '</CONTEXT>' cannot produce an empty or misplaced slice.
        end_index = lines.index('</CONTEXT>', start_index)
    except ValueError:
        # A missing opening or closing marker means the context is malformed;
        # report it as "no text" instead of aborting the whole file.
        return None
    return lines[start_index:end_index]
# def detex_text(text):
# detex_f = "../Documents/PF+PN_Collection/XML/detex_file.xml"
# text = " ".join(text)
# with open("../Documents/PF+PN_Collection/XML/tmp.xml", 'w') as temp_file:
# temp_file.write(text)
# detex = subprocess.Popen(["./detex-2.8/detex", "-t", "../Documents/PF+PN_Collection/XML/tmp.xml"], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
# detex_stdout = detex.communicate()[0]
# # with open(detex_f, 'r') as detex_file:
# # text = detex_file.readlines()
# return detex_stdout.decode('utf-8', 'replace')
# # return text