# In [17]:
# coding: utf-8
# python2.7
from __future__ import division, print_function
from nltk import tokenize # list_of_str = tokenize.sent_tokenize(str)
class CitationWindowParser(object):
    """Cut a fixed-size window of words or sentences around an in-text citation.

    A parser is configured once with a unit ('Word' or 'Sentence') and the
    number of units to keep before and after the unit containing the
    ``<CITATION>`` marker.  ``parse()`` then turns a citation context into an
    XML-style field ``<NAME>...</NAME>\\n`` and ``empty_parse()`` produces the
    same field with no content.

    Attributes:
        known_parsing (list of str): the accepted ``parse_type`` values.
        parse_type (str): the unit this parser splits on.
        before (int): number of units kept before the citing unit.
        after (int): number of units kept after the citing unit.
    """

    known_parsing = ['Word', 'Sentence']

    # Single source for the field wrapper used by every window this class emits.
    _context_format = "<{field_name}>{context}</{field_name}>\n"

    def __init__(self, parse_type, before, after):
        """
        Creates an object with a type of parsing, either 'Word' or 'Sentence',
        and a specific window size, before and after an in-text citation.

        Args:
            parse_type (str)
            before (int)
            after (int)
        Raises:
            ValueError: if parse_type is not listed in known_parsing.
        """
        if parse_type not in self.known_parsing:
            raise ValueError("Unknown type of parsing: '{}'\n Choose: {}".format(parse_type, self.known_parsing))
        self.parse_type = parse_type
        self.before = before
        self.after = after

    @property
    def field_name(self):
        """Name used inside the XML field markers, e.g. 'W25B25A'."""
        return self.format_field_name()

    @property
    def stringy_name(self):
        """Zero-padded lowercase name, e.g. 'w025b025a'."""
        return self.format_stringy_name()

    def format_field_name(self):
        """Builds '<initial><before>B<after>A' from the parser settings."""
        return "{parse_type}{before}B{after}A".format(
            parse_type=self.parse_type[0],
            before=self.before,
            after=self.after,
        )

    def format_stringy_name(self):
        """Builds '<initial><before>b<after>a', lowercase, sizes padded to 3 digits."""
        return "{parse_type}{before}b{after}a".format(
            parse_type=self.parse_type[0].lower(),
            before="{:0>3}".format(self.before),
            after="{:0>3}".format(self.after),
        )

    # The two getters below duplicate the properties; they are kept so that
    # existing callers using the method form keep working.
    def get_stringy_name(self):
        """Returns the same value as the stringy_name property."""
        return self.stringy_name

    def get_field_name(self):
        """Returns the same value as the field_name property."""
        return self.field_name

    def parse(self, text):
        """
        Creates an appropriately sized citation context window within XML field markers.

        Args:
            text (str or list of str)
        Returns:
            self.stringy_name (str)
            window (str)
        """
        units = text
        if self.parse_type == self.known_parsing[0]:
            units = self.split_on_words(text)
        elif self.parse_type == self.known_parsing[1]:
            units = self.split_on_sentences(text)
        cit_index = self.locate_citation_index(units)
        return self.stringy_name, self.make_context_window(units, cit_index)

    def empty_parse(self):
        """
        Creates an empty citation context window within XML field markers.

        Returns:
            self.stringy_name (str)
            window (str)
        """
        return self.stringy_name, self._wrap("")

    def locate_citation_index(self, context):
        """
        Identifies the index of the first element containing '<CITATION>'.

        Args:
            context (list)
        Returns:
            index (int), or None when no element contains the marker.
        """
        return next(
            (index for index, element in enumerate(context) if '<CITATION>' in element),
            None,
        )

    def split_on_words(self, context):
        """
        Splits a context on whitespace-separated words.

        Args:
            context (str or list of str)
        Returns:
            context (list)
        """
        return self._as_text(context).split()

    def split_on_sentences(self, context):
        """
        Splits a context on sentences using nltk's sent_tokenize.

        Line breaks and runs of whitespace are collapsed to single spaces
        before tokenizing, so words on either side of a line break stay
        separate words.

        Args:
            context (str or list of str)
        Returns:
            context (list)
        """
        flattened = " ".join(self._as_text(context).split())
        return tokenize.sent_tokenize(flattened)

    def make_context_window(self, context, cit_index):
        """
        Creates an appropriately sized citation context window.

        The window is clamped to the bounds of context, so a citation near
        either end simply yields fewer units on that side.

        Args:
            context (list)
            cit_index (int or None)
        Returns:
            window (str): the field with the joined units, or an empty field
                when cit_index is None.
        """
        if cit_index is None:
            print("Intext citation missing. Parser: {}\nContext: '{}...'".format(self.stringy_name, " ".join(context[:10]))) # This should be a log message at info level.
            return self._wrap("")
        start_index = max(cit_index - self.before, 0)
        end_index = min(cit_index + self.after + 1, len(context))
        return self._wrap(" ".join(context[start_index:end_index]))

    def _wrap(self, content):
        """Wraps content in this parser's XML field markers."""
        return self._context_format.format(field_name=self.field_name, context=content)

    @staticmethod
    def _as_text(context):
        """Returns context as one string, joining list items with a space.

        A plain string is returned untouched; joining it would insert the
        separator between every character.  type(u"") is `unicode` on
        Python 2 and `str` on Python 3.
        """
        if isinstance(context, (str, type(u""))):
            return context
        return " ".join(context)
# In [20]:
def main():
    """Demo: run a 25-word-before / 25-word-after parser on a sample context and print the result."""
    sample_context = """ It is well known, though, that both numerical simulations and
analytic descriptions of MHD turbulence (with a strong background
``guide field'' like in the corona) indicate that the cascade
from small to large wavenumber occurs most efficiently for
modes that do not increase in frequency (i.e., primarily a
fast cascade in and negligible transport in
).
In the low- corona, this type of quasi-two-dimensional
cascade would lead to kinetic Alfven waves and preferential
electron heating (in ), which is not observed.
This issue remains a topic of active research, with several possible
outcomes depending on the behavior of the anisotropic cascade when
kinetic processes become important
Ch05,CvB03,Hroy,Mk06,Hw06,<CITATION>Sc07</CITATION>.
Coronal Heating and Solar Wind Acceleration
Much of the above work dealt with determining the properties
of the ``microscopic'' fluctuations that dissipate to heat the
particles in the corona."""
    window_parser = CitationWindowParser('Word', 25, 25)
    # parse() returns a (stringy_name, window) pair; the pair is printed as-is.
    print(window_parser.parse(sample_context))


# Only run the demo when executed as a script, not on import.
if __name__ == '__main__':
    main()
# In [86]:
# In [ ]: