# In [ ]:

    
# coding: utf-8
# python2.7

from __future__ import division, print_function
from parsers import CitationWindowParser
# import subprocess

def create_context_parsers(window_specs):
    """
    Build one CitationWindowParser for every (window type, window size) pair.

    Args:
        window_specs (dict): maps each window type to a collection of
            window sizes. Item 0 of a size is handed to the parser as the
            "before" value and item 1 as the "after" value.

    Returns:
        list: the created parsers, ordered by the iteration order of
            window_specs and, within one window type, by the order of
            its sizes. Empty when there is nothing to iterate over.

    """
    return [
        CitationWindowParser(kind, size[0], size[1])
        for kind, sizes in window_specs.items()
        for size in sizes
    ]

def prepare_context_texts(filepath):
    """
    Load a citation-context file and return the text lines of each context.

    The file is cut into chunks by split_contexts, and every chunk is
    reduced to its context lines by delimit_text.

    Args:
        filepath (str): path of the file holding the citation contexts.

    Returns:
        texts (list of list of str): one list of lines per chunk for
            which delimit_text returned something other than None, in
            file order.

    """
    delimited = (delimit_text(chunk) for chunk in split_contexts(filepath))
    # delimit_text signals "no context section in this chunk" with None;
    # those chunks are left out of the result.
    return [lines for lines in delimited if lines is not None]

def split_contexts(filepath):
    """
    Read a citation-context file and split it on '<CITEDINTEXTID>'.

    Prints a progress message naming the file before opening it.

    Args:
        filepath (str): path of the file to read.

    Returns:
        contexts (list of str): the pieces of the file content that lie
            between '<CITEDINTEXTID>' markers. The marker itself is not
            part of any piece. The first element is whatever precedes
            the first marker (an empty string if the file starts with
            the marker).

    """
    print("...opening context file: {}".format(filepath))
    # NOTE: the file is opened with default settings (no explicit encoding).
    with open(filepath) as cc:
        # read() returns the whole content at once; there is no need to
        # build a list of lines just to join it back together.
        content = cc.read()

    return content.split('<CITEDINTEXTID>')

def delimit_text(context):
    """
    Extract the lines between the '<CONTEXT>' and '</CONTEXT>' tag lines.

    The context is split on newlines. A tag only counts when it is the
    entire content of a line (exact match, no surrounding whitespace).

    Args:
        context (str): raw text of a single citation context.

    Returns:
        list of str: the lines strictly between the first '<CONTEXT>'
            line and the first '</CONTEXT>' line that follows it (may
            be empty).
        None: when there is no '<CONTEXT>' line, or no '</CONTEXT>'
            line after it.

    """
    lines = context.split('\n')
    try:
        start_index = lines.index('<CONTEXT>') + 1
        # Look for the closing tag only after the opening one, so a stray
        # '</CONTEXT>' earlier in the chunk cannot produce an empty slice.
        end_index = lines.index('</CONTEXT>', start_index)
    except ValueError:
        # Missing opening or closing tag: treat the chunk as having no
        # usable context instead of aborting the whole run.
        return None
    return lines[start_index:end_index]

# def detex_text(text):
#     detex_f = "../Documents/PF+PN_Collection/XML/detex_file.xml"
#     text = " ".join(text)
#     with open("../Documents/PF+PN_Collection/XML/tmp.xml", 'w') as temp_file:
#         temp_file.write(text)
#     detex = subprocess.Popen(["./detex-2.8/detex", "-t", "../Documents/PF+PN_Collection/XML/tmp.xml"], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
#     detex_stdout = detex.communicate()[0]
#     # with open(detex_f, 'r') as detex_file:
# #         text = detex_file.readlines()
#     return detex_stdout.decode('utf-8', 'replace')
#     # return text