In [2]:
# coding: utf-8
# python2.7
from __future__ import division, print_function
from parsers import CitationWindowParser # use parse and empty_parse methods
from context_parsing_functions import *
from indexing_functions import *
from os import listdir, makedirs
from os.path import isfile, exists
from collections import defaultdict
import re
import subprocess
from multiprocessing import Pool
# Directory handed to index_collections() as the index location.
INDEXES_DIR = "../Retrieval/Indexes/"
# Root under which the document, context and output directories live.
COLLECTIONS_DIR = "../Documents/PF+PN_Collection/XML/"
# Directory listed by main() and read by append_citation_context_windows().
# Alternative subdirectory: "PF+PN/"
DOC_DIR = "{}{}".format(COLLECTIONS_DIR, "test/")
# Directory searched for "<id>-citation_contexts.xml" files.
# Alternative subdirectories: "C.l080b080a.raw", "C.l080b080a.detex"
CONTEXT_DIR = "{}{}".format(COLLECTIONS_DIR, "C.l080b080a.manual_clean/")
# Window specifications passed to create_context_parsers() and index_collections().
WINDOW_SPECS = dict(Sentence=[(3, 3)], Word=[(50, 50)])
# Fuller set of window specifications, kept for reference:
# {'Sentence': [(0,0),(1,0),(1,1),(2,0),(2,1),(2,2),(3,0),(3,1),(3,2),(3,3),(4,0),(4,1),(4,2),(4,3),(4,4),(5,0),(5,1),(5,2),(5,3),(5,4),(5,5)], 'Word': [(25,0),(25,25),(50,0),(50,25),(50,50),(75,0),(75,25),(75,50),(75,75),(100,0),(100,25),(100,50),(100,75),(100,100)]}
def append_citation_context_windows(isearch_file, doc_dir=DOC_DIR):
    """
    Combines one isearch document with its sentence- and word-based
    citation context windows and writes the result to new files.
    Args:
        isearch_file (str): file name of the document inside doc_dir
        doc_dir=DOC_DIR (str): directory prefix, concatenated directly
            with isearch_file (so it must end with a path separator)
    """
    # step 1: load the document text
    print("...Reading: {}".format(isearch_file))
    source_path = doc_dir + isearch_file
    document_body = prepare_document_text(source_path)

    # step 2: build the citation windows for this document
    print("...Preparing citation windows")
    windows_by_size = prepare_citation_windows(isearch_file)

    # step 3: write the document text with the windows appended
    print("...Writing text files to collections\n")
    write_files_with_cit_windows(isearch_file, document_body, windows_by_size)
def prepare_citation_windows(isearch_file, context_dir=CONTEXT_DIR):
    """
    Builds the citation context windows for one document, one entry
    per parser created from WINDOW_SPECS.
    Args:
        isearch_file (str): document file name containing the isearch id
        context_dir=CONTEXT_DIR (str): directory holding the
            "<id>-citation_contexts.xml" files
    Returns:
        citation_windows (dict): parser name -> list of parsed windows
            when a contexts file exists; otherwise parser name -> the
            value returned by that parser's empty_parse()
    """
    window_parsers = create_context_parsers(WINDOW_SPECS)
    contexts_file = "{}-citation_contexts.xml".format(find_isearch_id(isearch_file))
    citation_windows = defaultdict(list)

    # no contexts file for this document: store each parser's empty window
    if contexts_file not in listdir(context_dir):
        for window_parser in window_parsers:
            stringy_name, empty_window = window_parser.empty_parse()
            citation_windows[stringy_name] = empty_window
        return citation_windows

    # contexts file found: run every parser over every citation context
    for context in prepare_context_texts(context_dir + contexts_file):
        for window_parser in window_parsers:
            stringy_name, parsed_window = window_parser.parse(context)
            citation_windows[stringy_name].append(parsed_window)
    return citation_windows
def prepare_document_text(filepath):
    """
    Extracts content from an iSearch XML file,
    returns content within an <ISEARCHDOC> XML field.

    An '<ISEARCHDOC>' line is inserted right after the first <DOC> line
    and the first </DOC> line is replaced by '</ISEARCHDOC>', so the
    returned text has no closing </DOC>.
    Tag lines are matched ignoring surrounding whitespace, so a final
    </DOC> without a trailing newline (or with CRLF line endings) is
    accepted.
    Args:
        filepath (str)
    Returns:
        formatted_text (str)
    Raises:
        ValueError: if the file has no <DOC> or no </DOC> line
    """
    with open(filepath) as doc:
        lines = doc.readlines()
    # compare on stripped lines so line endings do not affect matching
    tags = [line.strip() for line in lines]
    if '<DOC>' not in tags or '</DOC>' not in tags:
        raise ValueError(
            "{}: expected <DOC> and </DOC> on lines of their own".format(filepath))
    begin_index = tags.index('<DOC>')
    end_index = tags.index('</DOC>')
    # replace before inserting so end_index is still valid
    lines[end_index] = '</ISEARCHDOC>\n'
    lines.insert(begin_index + 1, '<ISEARCHDOC>\n')
    formatted_text = "".join(lines)
    return formatted_text
def find_isearch_id(filename):
    """
    Returns an isearch id when found as an XML filename.
    The id is "PF" or "PN" followed by six digits, directly before ".xml";
    it may appear anywhere in filename (e.g. after a directory prefix).
    Args:
        filename (str)
    Returns:
        isearch_id (str)
    Raises:
        AttributeError: if filename contains no such id
    """
    # raw string so that "\." reaches the regex engine as an escaped dot
    # instead of being treated as a (invalid) string escape sequence
    match = re.search(r'(P[FN][0-9]{6})\.xml', filename)
    isearch_id = match.group(1)
    return isearch_id
def write_files_with_cit_windows(isearch_file, doc_text, citation_windows, collection_dir=COLLECTIONS_DIR):
    """
    Writes isearch document text and citation context windows
    to a new file, one file per window size, in the directory
    collection_dir + "PF+PN+C.<size>/" (created if missing).
    An existing file of the same name is overwritten, so re-running
    the notebook does not duplicate document content.
    Args:
        isearch_file (str)
        doc_text (str)
        citation_windows (dict of list of str)
        collection_dir=COLLECTIONS_DIR (str)
    """
    for size, windows in citation_windows.items():
        windows = "".join(windows)
        write_dir = collection_dir + "PF+PN+C." + size + "/"
        if not exists(write_dir):
            try:
                makedirs(write_dir)
            except OSError:
                # another pool worker may have created the directory between
                # the check and the call; re-raise only genuine failures
                if not exists(write_dir):
                    raise
        # 'w' instead of 'a': appending would add a second copy of the
        # document on every re-run and leave malformed XML behind
        with open(write_dir + isearch_file, 'w') as combo:
            combo.write("{}<CITATIONS>\n{}</CITATIONS>\n</DOC>"
                        .format(doc_text, windows))
def main():
    """
    Runs append_citation_context_windows over every entry in DOC_DIR
    using a single-worker pool, then calls index_collections.
    """
    p = Pool(1)
    try:
        p.map(append_citation_context_windows, listdir(DOC_DIR))
    finally:
        # always release the worker process, even when a document fails
        p.close()
        p.join()
    index_collections(WINDOW_SPECS, INDEXES_DIR, COLLECTIONS_DIR)
if __name__ == '__main__':
    main()
In [3]:
def main():
    """
    Runs append_citation_context_windows over every entry in DOC_DIR
    using a single-worker pool.
    NOTE: this redefines the main() of the previous cell and, unlike it,
    does not call index_collections.
    """
    p = Pool(1)
    try:
        p.map(append_citation_context_windows, listdir(DOC_DIR))
    finally:
        # always release the worker process, even when a document fails
        p.close()
        p.join()
    # create_context_parsers()
    # print(prepare_context_texts("../Append_Citation_Contexts/Cits/PF309700-citation_contexts.xml"))
if __name__ == '__main__':
    main()
In [4]:
import subprocess
# Sample LaTeX input for detex_text(). Raw string: backslashes must stay
# literal, otherwise "\b" in "\begin" turns into a backspace character
# and "\N" is rejected as a malformed escape on Python 3.
text = r""" la la ala \begin{abstract}
I present a toy model for the Berkovits pure spinor superparticle.
It is a $D=1$, $\N=2$ superparticle with no physical degrees of
freedom. We study the cohomology in various ways, in particular
finding an explicit expression for the `b'-field. Finally, we
construct the topological string B-model from a straightforward
generalization of the system.
\end{abstract}
%\pacs{}% PACS, the Physics and Astronomy
% Classification Scheme.
\keywords{BRST, Superstring, Pure spinors}
\maketitle
"""
# "/usr/texbin/detex"
def detex_text(text):
    """
    Pipes text through the external `detex -t` command and returns
    what the command printed (stderr is merged into stdout).
    Args:
        text (str): input for detex; text strings are sent as UTF-8
    Returns:
        detexed_text (str): command output decoded as UTF-8
    """
    # the pipe carries bytes: encode text strings before sending them
    if not isinstance(text, bytes):
        text = text.encode("utf-8")
    detex = subprocess.Popen(["detex", "-t"], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
    detex_stdout = detex.communicate(input=text)[0]
    # explicit codec: the Python 2 default (ascii) fails on non-ASCII output
    return detex_stdout.decode("utf-8")
detex_text(text)
In [ ]: