In [2]:
# coding: utf-8 
# python2.7

from __future__ import division, print_function
from parsers import CitationWindowParser # use parse and empty_parse methods
from context_parsing_functions import *
from indexing_functions import *
from os import listdir, makedirs
from os.path import isfile, exists 
from collections import defaultdict
import re
import subprocess
from multiprocessing import Pool

INDEXES_DIR = "../Retrieval/Indexes/" # ../Retrieval/Indexes/
COLLECTIONS_DIR = "../Documents/PF+PN_Collection/XML/" 
DOC_DIR = COLLECTIONS_DIR + "test/"   # "PF+PN/"
CONTEXT_DIR = COLLECTIONS_DIR + "C.l080b080a.manual_clean/"  # "C.l080b080a.raw", "C.l080b080a.detex"
WINDOW_SPECS = {'Sentence': [(3,3)], 'Word': [(50,50)]}
# {'Sentence': [(0,0),(1,0),(1,1),(2,0),(2,1),(2,2),(3,0),(3,1),(3,2),(3,3),(4,0),(4,1),(4,2),(4,3),(4,4),(5,0),(5,1),(5,2),(5,3),(5,4),(5,5)], 'Word': [(25,0),(25,25),(50,0),(50,25),(50,50),(75,0),(75,25),(75,50),(75,75),(100,0),(100,25),(100,50),(100,75),(100,100)]}

def append_citation_context_windows(isearch_file, doc_dir=DOC_DIR): 
    """
    Appends sentence and word-based citation context windows 
    to isearch document content in a new file.
    
    Args:
        isearch_file (str) 
        doc_dir=DOC_DIR (str)
    
    """
    # prepare document text
    print("...Reading: {}".format(isearch_file))
    doc_text = prepare_document_text(doc_dir + isearch_file)    
    # prepare citation windows
    print("...Preparing citation windows")
    citation_windows = prepare_citation_windows(isearch_file)
    # write files with document text and citation context windows appended
    print("...Writing text files to collections\n")
    write_files_with_cit_windows(isearch_file, doc_text, citation_windows)

def prepare_citation_windows(isearch_file, context_dir=CONTEXT_DIR):
    """
    Prepares XML-formatted citation context windows 
    of different sizes for each citation context.
    
    Args:
        isearch_file (str)
        context_dir=CONTEXT_DIR (str)
    
    Returns:
        citation_windows (dict of list of str)
    """
    citation_windows = defaultdict(list)
    
    # initiate parsers
    parsers = create_context_parsers(WINDOW_SPECS)
    
    # identify if file with citation contexts exists 
    contexts_file = "{}-citation_contexts.xml".format(find_isearch_id(isearch_file))

    if contexts_file in listdir(context_dir):
        # prepare citation context texts
        citation_contexts = prepare_context_texts(context_dir + contexts_file)
        # parse citation contexts into windows
        for context in citation_contexts:
            for parser in parsers:
                stringy_name, window = parser.parse(context)
                citation_windows[stringy_name].append(window)
    # otherwise...
    else:
        # make empty windows
        for parser in parsers:
                stringy_name, window = parser.empty_parse()
                citation_windows[stringy_name] = window
    return citation_windows
    
       
def prepare_document_text(filepath):
    """
    Extracts content from an iSearch XML file, 
    returns content within an <ISEARCHDOC> XML field.
    
    Args:
        filepath (str) 
     
    Returns: 
        formatted_text (str)
    
    """
    with open(filepath) as doc:
        text = doc.readlines()
    
    begin_index = text.index('<DOC>\n')
    text.insert(begin_index + 1, '<ISEARCHDOC>\n')
    text[text.index('</DOC>\n')] = '</ISEARCHDOC>\n'
    formatted_text = "".join(text)
    return formatted_text

def find_isearch_id(filename):
    """
    Returns an isearch id when found as an XML filename.
    
    Args:
        filename (str) 
     
    Returns: 
        isearch_id (str)
    
    """
    isearch_id = re.search('(P[FN][0-9]{6})\.xml', filename).group(1)
    return isearch_id    

def write_files_with_cit_windows(isearch_file, doc_text, citation_windows, collection_dir=COLLECTIONS_DIR):
    """
    Writes isearch document text and citation context windows 
    to a new file.
    
    Args:
        isearch_file (str) 
        doc_text (str)
        citation_windows (dict of list of str)
        collection_dir=COLLECTIONS_DIR (str)
    
    """
    for size, windows in citation_windows.items():
        windows = "".join(windows)
        write_dir = collection_dir + "PF+PN+C." + size + "/"
        if not exists(write_dir):
            makedirs(write_dir)
        with open(write_dir + isearch_file, 'a') as combo:
            combo.write("{}<CITATIONS>\n{}</CITATIONS>\n</DOC>"
                        .format(doc_text, windows)) 
    return
    
def main():
    p = Pool(1)
    p.map(append_citation_context_windows, listdir(DOC_DIR))
    
    index_collections(WINDOW_SPECS, INDEXES_DIR, COLLECTIONS_DIR)

if __name__ == '__main__':
    main()

In [3]:
def main():
    p = Pool(1)
    p.map(append_citation_context_windows, listdir(DOC_DIR))
#     create_context_parsers()
#     print(prepare_context_texts("../Append_Citation_Contexts/Cits/PF309700-citation_contexts.xml"))

if __name__ == '__main__':
    main()


...Reading: PF309692.xml
...preparing citation windows
...writing files

...Reading: PF309700.xml
...preparing citation windows
...opening context file: ../Append_Citation_Contexts/Cits/PF309700-citation_contexts.xml
...writing files

...Reading: PF309716.xml
...preparing citation windows
...opening context file: ../Append_Citation_Contexts/Cits/PF309716-citation_contexts.xml
...writing files

...Reading: PN309657.xml
...preparing citation windows
...opening context file: ../Append_Citation_Contexts/Cits/PN309657-citation_contexts.xml
...writing files

...Reading: PN309677.xml
...preparing citation windows
...writing files

...Reading: PN309685.xml
...preparing citation windows
...opening context file: ../Append_Citation_Contexts/Cits/PN309685-citation_contexts.xml
...writing files


In [4]:
import subprocess

text =""" la la ala \begin{abstract}
 I present a toy model for the Berkovits pure spinor superparticle.
 It is a $D=1$, $\N=2$ superparticle with no physical degrees of
 freedom. We study the cohomology in various ways, in particular
 finding an explicit expression for the `b'-field. Finally, we
 construct the topological string B-model from a straightforward
 generalization of the system.
 \end{abstract}
 
 %\pacs{}% PACS, the Physics and Astronomy
                              % Classification Scheme.
 
 \keywords{BRST, Superstring, Pure spinors}
 
 \maketitle
 """

# "/usr/texbin/detex"
def detex_text(text):
    detex = subprocess.Popen(["detex", "-t"], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
    detex_stdout = detex.communicate(input=text)[0]
    return detex_stdout.decode()

detex_text(text)


 la la ala eginabstract
 I present a toy model for the Berkovits pure spinor superparticle.
 It is a ,  superparticle with no physical degrees of
 freedom. We study the cohomology in various ways, in particular
 finding an explicit expression for the `b'-field. Finally, we
 construct the topological string B-model from a straightforward
 generalization of the system.
 abstract
 
 
                              
 
 BRST, Superstring, Pure spinors
 
 
 

In [ ]: