In [17]:
# coding: utf-8
# python2.7

from __future__ import division, print_function
from nltk import tokenize # list_of_str = tokenize.sent_tokenize(str)

class CitationWindowParser(object):
    """
    Cuts a window of words or sentences around an in-text citation.

    The citation is recognised by the literal '<CITATION>' marker. The window
    is returned wrapped in an XML-like field whose tag encodes the parse type
    and the window size (e.g. '<W25B25A>...</W25B25A>').

    Declared as a new-style class so that the properties below behave the same
    under Python 2.7 and Python 3.
    """

    known_parsing = ['Word', 'Sentence']

    # Literal marker that identifies the unit holding the in-text citation.
    _CITATION_MARKER = '<CITATION>'

    def __init__(self, parse_type, before, after):
        """
        Creates an object with a type of parsing, either 'Word' or 'Sentence',
        and taking a specific window size, before and after an in-text citation.

        The initiated object can parse a citation context (a string or a list
        of strings) into a window of the specified type and size using the
        .parse() method.

        It can also return an empty window using the .empty_parse() method.

        Args:
            parse_type (str): one of CitationWindowParser.known_parsing
            before (int): number of units kept before the citing unit
            after (int): number of units kept after the citing unit

        Raises:
            ValueError: if parse_type is not in known_parsing

        """
        if parse_type not in self.known_parsing:
            raise ValueError("Unknown type of parsing: '{}'\n Choose: {}".format(parse_type, self.known_parsing))

        self.parse_type = parse_type
        self.before = before
        self.after = after

    @property
    def field_name(self):
        """Tag used in the XML-like wrapper, e.g. 'W25B25A'."""
        return self.format_field_name()

    @property
    def stringy_name(self):
        """Lower-case, zero-padded parser name, e.g. 'w025b025a'."""
        return self.format_stringy_name()

    def format_field_name(self):
        """
        Builds the field name: first letter of the parse type followed by the
        window sizes, e.g. 'W25B25A'.

        Returns:
            name (str)

        """
        return "{parse_type}{before}B{after}A".format(
            parse_type=self.parse_type[0],
            before=self.before,
            after=self.after,
        )

    def format_stringy_name(self):
        """
        Builds the stringy name: lower-cased first letter of the parse type
        followed by the window sizes zero-padded to three characters,
        e.g. 'w025b025a'.

        Returns:
            name (str)

        """
        zero_padded = "{:0>3}".format
        return "{parse_type}{before}b{after}a".format(
            parse_type=self.parse_type[0].lower(),
            before=zero_padded(self.before),
            after=zero_padded(self.after),
        )

    def get_stringy_name(self):
        """Method-style accessor for the stringy_name property."""
        return self.stringy_name

    def get_field_name(self):
        """Method-style accessor for the field_name property."""
        return self.field_name

    def parse(self, text):
        """
        Creates an appropriately sized citation context window within XML field markers.

        Args:
            self (obj)
            text (str or list of str)

        Returns:
            self.stringy_name (str)
            window (str)

        """
        if self.parse_type == self.known_parsing[0]:
            text = self.split_on_words(text)

        elif self.parse_type == self.known_parsing[1]:
            text = self.split_on_sentences(text)

        cit_index = self.locate_citation_index(text)
        window = self.make_context_window(text, cit_index)
        return self.stringy_name, window

    def empty_parse(self):
        """
        Creates an empty citation context window within XML field markers.

        Args:
            self (obj)

        Returns:
            self.stringy_name (str)
            window (str)

        """
        return self.stringy_name, self._wrap("")

    def locate_citation_index(self, context):
        """
        Identifies the context index for an in-text citation.

        Args:
            self (obj)
            context (list)

        Returns:
            index (int) of the first element containing '<CITATION>',
            or (NoneType) when no element contains it

        """
        for index, element in enumerate(context):
            if self._CITATION_MARKER in element:
                return index
        return None

    def split_on_words(self, context):
        """
        Splits a context on words (runs of whitespace).

        Args:
            self (obj)
            context (str or list of str)

        Returns:
            context (list)

        """
        return self._join_context(context).split()

    def split_on_sentences(self, context):
        """
        Splits a context on sentences.

        Line breaks and other whitespace runs are collapsed to a single space
        before tokenizing, so words on adjacent lines are never fused together.

        Args:
            self (obj)
            context (str or list of str)

        Returns:
            context (list)

        """
        flattened = " ".join(self._join_context(context).split())
        return tokenize.sent_tokenize(flattened)

    def make_context_window(self, context, cit_index):
        """
        Creates an appropriately sized citation context window.

        Args:
            self (obj)
            context (list)
            cit_index (int) or (NoneType)

        Returns:
            window (str): the selected units joined by single spaces inside the
            field markers; an empty field when cit_index is None

        """
        if cit_index is None:
            # TODO: this should be a log message at info level.
            print("Intext citation missing. Parser: {}\nContext: '{}...'".format(self.stringy_name, " ".join(context[:10])))
            return self._wrap("")

        # Clamp both ends so a citation near an edge yields a shorter window
        # (a negative start would otherwise count from the end of the list).
        start_index = max(cit_index - self.before, 0)
        end_index = min(cit_index + self.after + 1, len(context))
        return self._wrap(" ".join(context[start_index:end_index]))

    def _wrap(self, content):
        """Wraps content in the '<field_name>...</field_name>' markers plus a newline."""
        return "<{field_name}>{context}</{field_name}>\n".format(
            field_name=self.field_name,
            context=content,
        )

    @staticmethod
    def _join_context(context):
        """Returns context as one string; list items are joined with a space so they cannot fuse."""
        try:
            string_types = basestring  # Python 2: covers both str and unicode
        except NameError:
            string_types = str  # Python 3
        if isinstance(context, string_types):
            return context
        return " ".join(context)

In [20]:
def main():
    """Demo: print the window of 25 words before and 25 words after the citation in a sample passage."""
    sample_context = """ It is well known, though, that both numerical simulations and
 analytic descriptions of MHD turbulence (with a strong background
 ``guide field'' like in the corona) indicate that the cascade
 from small to large wavenumber occurs most efficiently for
 modes that do not increase in frequency (i.e., primarily a
 fast cascade in  and negligible transport in
 ).
 In the low- corona, this type of quasi-two-dimensional
 cascade would lead to kinetic Alfven waves and preferential
 electron heating (in ), which is not observed.
 This issue remains a topic of active research, with several possible
 outcomes depending on the behavior of the anisotropic cascade when
 kinetic processes become important
 Ch05,CvB03,Hroy,Mk06,Hw06,<CITATION>Sc07</CITATION>.
 
 
 
 Coronal Heating and Solar Wind Acceleration
 
 Much of the above work dealt with determining the properties
 of the ``microscopic'' fluctuations that dissipate to heat the
 particles in the corona."""

    word_parser = CitationWindowParser('Word', 25, 25)
    # parse() returns a (stringy_name, window) tuple; print it as-is.
    print(word_parser.parse(sample_context))

# Run the demo only when this code is executed directly (script or notebook cell), not when imported.
if __name__ == "__main__":
    main()


('w025b025a', "<W25B25A>This issue remains a topic of active research, with several possible outcomes depending on the behavior of the anisotropic cascade when kinetic processes become important Ch05,CvB03,Hroy,Mk06,Hw06,<CITATION>Sc07</CITATION>. Coronal Heating and Solar Wind Acceleration Much of the above work dealt with determining the properties of the ``microscopic'' fluctuations that dissipate to heat the</W25B25A>\n")

In [86]:


In [ ]: