Problem: RST-DT files aren't tokenized, but their corresponding PTB files are

off-the-shelf tokenizers don't reproduce PTB tokenization exactly
let's try the preprocessing provided by the Educational Testing Service's (ETS) RST discourse parser

ETS discourse-parser

offers a script convert_rst_discourse_tb, in discourseparsing.convert_rst_discourse_tb:main,
which preprocesses the RST-DT files, so that nltk's tokenizer produces
the same tokenization that PTB uses
NOTE: their code uses Python 3

Their code includes:

some magic preprocessing

with open(path_dis) as f:
    rst_tree_str = f.read().strip()
    rst_tree_str = fix_rst_treebank_tree_str(rst_tree_str)
    rst_tree_str = convert_parens_in_rst_tree_str(rst_tree_str)
    rst_tree = ParentedTree.fromstring(rst_tree_str)
    reformat_rst_tree(rst_tree)

some file mappings

# file mapping from the RSTDTB documentation
file_mapping = {'file1.edus': 'wsj_0764.out.edus',
                'file2.edus': 'wsj_0430.out.edus',
                'file3.edus': 'wsj_0766.out.edus',
                'file4.edus': 'wsj_0778.out.edus',
                'file5.edus': 'wsj_2172.out.edus'}

some whitespace trickery

edu = re.sub(r'>\s*', r'', edu).replace('&amp;', '&')
edu = re.sub(r'---', r'--', edu)
edu = edu.replace('. . .', '...')
...

and a lot of "annoying edge cases"

if path_basename == 'file1.edus':
    edu = edu.replace('founded by',
                      'founded by his grandfather.')
elif (path_basename == 'wsj_0660.out.edus'
      or path_basename == 'wsj_1368.out.edus'
      or path_basename == "wsj_1371.out.edus"):
    edu = edu.replace('S.p. A.', 'S.p.A.')
elif path_basename == 'wsj_1329.out.edus':
    edu = edu.replace('G.m.b. H.', 'G.m.b.H.')
...

TODO: parse all files with discoursegraphs, then fix the "annoying edge cases"



In [1]:

    
import os
import re

from nltk.tree import ParentedTree
import sexpdata

import discoursegraphs as dg

# Root directory of the local RST-DT checkout (the RSTtrees-WSJ-main-1.0 data folder).
RSTDT_ROOTDIR = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0')
# A single .dis file from the TEST folder, used for the interactive experiments in this notebook.
RSTDT_TEST_FILE = os.path.join(RSTDT_ROOTDIR, 'TEST', 'wsj_1306.out.dis')









    



Couldn't import dot_parser, loading of dot files will not be possible.



In [2]:

    
_ptb_paren_mapping = {'(': r'-LRB-',
                      ')': r'-RRB-',
                      '[': r'-LSB-',
                      ']': r'-RSB-',
                      '{': r'-LCB-',
                      '}': r'-RCB-'}



In [3]:

    
def fix_rst_treebank_tree_str(rst_tree_str):
    '''
    Strip the stray ``//TT_ERR`` marker that directly follows a closing
    parenthesis, keeping the parenthesis itself.

    These unexplained comments occur in two files, which cannot be parsed
    otherwise:
    - data/RSTtrees-WSJ-main-1.0/TRAINING/wsj_2353.out.dis
    - data/RSTtrees-WSJ-main-1.0/TRAINING/wsj_2367.out.dis

    Parameters
    ----------
    rst_tree_str : str
        the content of an RST-DT ``.dis`` file

    Returns
    -------
    str
        the same string with every ``)//TT_ERR`` reduced to ``)``
    '''
    tt_err_after_paren = re.compile(r'\)//TT_ERR')
    return tt_err_after_paren.sub(')', rst_tree_str)



In [4]:

    
def convert_parens_in_rst_tree_str(rst_tree_str):
    '''
    Rewrite brackets and parentheses found between ``_!`` markers (the EDU
    texts of the RST discourse treebank) as Penn Treebank tokens such as
    -LRB-, so that the NLTK tree API doesn't crash when reading the RST
    trees.

    Parameters
    ----------
    rst_tree_str : str
        the content of an RST-DT ``.dis`` file

    Returns
    -------
    str
        the tree string after one substitution pass per bracket type
    '''
    # NOTE(review): ``[^_(?=!)]`` is an ordinary character class (it excludes
    # the characters ``_ ( ? = ! )``), not a negative lookahead. The pattern
    # is kept verbatim; verify that EDUs containing several brackets are
    # converted completely.
    pattern_template = '(_![^_(?=!)]*)\\{}([^_(?=!)]*_!)'
    replacement_template = '\\g<1>{}\\g<2>'

    converted = rst_tree_str
    for bracket, ptb_token in _ptb_paren_mapping.items():
        converted = re.sub(pattern_template.format(bracket),
                           replacement_template.format(ptb_token),
                           converted)
    return converted



In [5]:

    
def disfile2tree(dis_filepath):
    """
    reads an RST-DT ``.dis`` file and returns it as a parented NLTK tree

    The file content is stripped, cleaned with fix_rst_treebank_tree_str()
    and bracket-escaped with convert_parens_in_rst_tree_str() before it is
    handed to the NLTK tree parser.

    Parameters
    ----------
    dis_filepath : str
        path to a ``.dis`` file

    Returns
    -------
    nltk.tree.ParentedTree
        the parsed rhetorical structure tree
    """
    with open(dis_filepath) as dis_file:
        raw_content = dis_file.read()

    cleaned = fix_rst_treebank_tree_str(raw_content.strip())
    escaped = convert_parens_in_rst_tree_str(cleaned)
    return ParentedTree.fromstring(escaped)



In [6]:

    
# Parse the sample TEST document once; the following cells explore this tree.
tree = disfile2tree(RSTDT_TEST_FILE)



In [7]:

    
# Labels of the structural nodes of an RST tree.
SUBTREE_TYPES = ('Root', 'Nucleus', 'Satellite')
# Labels a structural node's first child may carry (cf. get_node_type).
NODE_TYPES = ('leaf', 'span')


def is_tree(tree):
    """
    returns True, iff the given element is a (sub)tree, i.e. its label is
    one of SUBTREE_TYPES (and not a leaf or span node)

    Elements that have no ``label()`` method at all (e.g. the plain strings
    that form the leaves of an NLTK tree) are reported as False instead of
    raising an AttributeError.

    Parameters
    ----------
    tree : nltk.tree.ParentedTree
        a tree representing a rhetorical structure (or a part of it)

    Returns
    -------
    bool
        True for Root, Nucleus and Satellite nodes, False otherwise
    """
    label = getattr(tree, 'label', None)
    if not callable(label):
        # not a tree node at all, so it cannot be a subtree
        return False
    return label() in SUBTREE_TYPES



In [8]:

    
def get_node_type(tree):
    """
    returns the label of the first child of the given subtree, which must
    be either 'leaf' or 'span'

    Parameters
    ----------
    tree : nltk.tree.ParentedTree
        a tree representing a rhetorical structure (or a part of it)

    Returns
    -------
    str
        'leaf' or 'span'

    Raises
    ------
    AssertionError
        if the first child carries any other label
    """
    first_child = tree[0]
    kind = first_child.label()
    assert kind in ('leaf', 'span')
    return kind



In [9]:

    
class Mock(object):
    """Minimal stand-in passed as ``self`` to the prototype functions in this notebook."""
    # namespace prefix that get_node_id() puts in front of every node ID
    ns = 'rst'

def get_node_id(self, nuc_or_sat):
    """
    returns the node ID of the given subtree, prefixed with ``self.ns``

    The ID is built from the leaves of the subtree's first child:
    ``<ns>:<leaf>`` for a 'leaf' node (e.g. ``rst:1``) and
    ``<ns>:span:<first>-<second>`` for a 'span' node (e.g. ``rst:span:1-20``).

    Parameters
    ----------
    self : object
        any object with an ``ns`` (namespace) attribute
    nuc_or_sat : nltk.tree.ParentedTree
        a subtree whose first child is a 'leaf' or 'span' node

    Returns
    -------
    str
        the node ID
    """
    node_type = get_node_type(nuc_or_sat)
    boundaries = nuc_or_sat[0].leaves()
    if node_type == 'leaf':
        return '{}:{}'.format(self.ns, boundaries[0])
    # otherwise node_type == 'span': the first two leaves delimit the span
    return '{}:span:{}-{}'.format(self.ns, boundaries[0], boundaries[1])



In [10]:

    
mock = Mock()

# Sanity check on the root and its direct subtrees. print() is called with a
# single pre-formatted string so that the cell prints the same text under
# Python 2 and is also valid Python 3.
print('{} {} {}'.format(is_tree(tree), get_node_type(tree), get_node_id(mock, tree)))
for subelem in tree:
    if is_tree(subelem):
        print('{} {} {}'.format(is_tree(subelem), get_node_type(subelem),
                                get_node_id(mock, subelem)))









    



True span rst:span:1-47
True span rst:span:1-20
True span rst:span:21-47



In [11]:

    
def get_tree_type(tree):
    """
    returns the label of the (sub)tree, which must be one of SUBTREE_TYPES
    (Root, Nucleus or Satellite)

    Raises
    ------
    AssertionError
        if the tree carries any other label
    """
    label = tree.label()
    assert label in SUBTREE_TYPES
    return label



In [ ]:



In [12]:

    
def get_relation_type(self, tree):
    """
    returns the RST relation type stored in the given subtree, e.g. `span`,
    `elaboration` or `antithesis`.

    The value is the first element of the subtree's second child
    (presumably the `rel2par` node of the .dis format -- TODO confirm).
    ``self`` is not used.
    """
    relation_node = tree[1]
    return relation_node[0]



In [13]:

    
def treeprint(tree, tab=0):
    """
    prints one line per subtree below the given tree: its node ID and its
    relation type, indented by three spaces per nesting level

    Relies on the module-level ``mock`` object as the ``self`` argument of
    get_node_id() and get_relation_type().

    Parameters
    ----------
    tree : nltk.tree.ParentedTree
        a tree representing a rhetorical structure (or a part of it)
    tab : int
        nesting level of the given tree's children (default: 0)
    """
    indent = '   ' * tab
    for elem in tree:
        if not is_tree(elem):
            continue
        # single-argument print(): same output on Python 2, valid on Python 3
        print("{}{} {}".format(indent,
                               get_node_id(mock, elem),
                               get_relation_type(mock, elem)))
        treeprint(elem, tab=tab+1)



In [14]:

    
# Second child of the root, i.e. its first Nucleus/Satellite subtree.
nuc = tree[1]
# single-argument print(): same output on Python 2, valid on Python 3
print(get_tree_type(nuc))









    



Nucleus



In [15]:

    
# Second child of the nucleus; get_relation_type() reads its first element.
reltype = nuc[1]



In [16]:

    
# Check what kind of object the relation value is.
# single-argument print(): same output on Python 2, valid on Python 3
print(type(reltype[0]))









    



<type 'str'>



In [17]:

    
# Dump the node IDs and relation types of the whole sample document.
treeprint(tree)









    



rst:span:1-20 span
   rst:span:1-14 span
      rst:span:1-8 span
         rst:span:1-4 Inverted-Sequence
            rst:span:1-3 span
               rst:1 attribution
               rst:span:2-3 span
                  rst:2 span
                  rst:3 elaboration-object-attribute-e
            rst:4 elaboration-additional
         rst:span:5-8 Inverted-Sequence
            rst:span:5-6 span
               rst:5 attribution
               rst:6 span
            rst:span:7-8 reason
               rst:7 attribution
               rst:8 span
      rst:span:9-14 circumstance
         rst:span:9-11 span
            rst:9 span
            rst:span:10-11 elaboration-object-attribute-e
               rst:10 span
               rst:11 elaboration-object-attribute-e
         rst:span:12-14 elaboration-additional
            rst:12 span
            rst:span:13-14 consequence-s
               rst:13 span
               rst:14 elaboration-object-attribute-e
   rst:span:15-20 reason
      rst:span:15-17 span
         rst:15 attribution
         rst:span:16-17 span
            rst:16 span
            rst:17 elaboration-object-attribute-e
      rst:span:18-20 explanation-argumentative
         rst:span:18-19 span
            rst:18 span
            rst:19 reason
         rst:20 attribution
rst:span:21-47 evaluation-s
   rst:span:21-30 span
      rst:span:21-27 span
         rst:span:21-23 span
            rst:21 attribution
            rst:span:22-23 span
               rst:22 antithesis
               rst:23 span
         rst:span:24-27 elaboration-additional
            rst:span:24-25 antithesis
               rst:24 span
               rst:25 attribution
            rst:span:26-27 span
               rst:26 span
               rst:27 elaboration-object-attribute-e
      rst:span:28-30 elaboration-additional
         rst:28 span
         rst:span:29-30 temporal-same-time
            rst:29 span
            rst:30 elaboration-set-member-e
   rst:span:31-47 elaboration-general-specific
      rst:span:31-36 Contrast
         rst:span:31-33 span
            rst:31 attribution
            rst:span:32-33 span
               rst:32 span
               rst:33 temporal-after
         rst:span:34-36 comment
            rst:span:34-35 span
               rst:34 span
               rst:35 attribution
            rst:36 elaboration-object-attribute-e
      rst:span:37-47 Contrast
         rst:37 span
         rst:span:38-47 elaboration-general-specific
            rst:span:38-40 List
               rst:38 attribution
               rst:span:39-40 span
                  rst:39 Sequence
                  rst:40 Sequence
            rst:span:41-42 List
               rst:41 attribution
               rst:42 span
            rst:span:43-44 List
               rst:43 attribution
               rst:44 span
            rst:span:45-47 List
               rst:45 attribution
               rst:span:46-47 span
                  rst:46 span
                  rst:47 concession



In [18]:

    
# First child of the root's first subtree: its 'span' node.
span = tree[1][0]



In [19]:

    
# The leaves of a span node are its two boundary values.
span.leaves()









    Out[19]:





['1', '20']



In [20]:

    
# Try to read every .dis file of both corpus folders with discoursegraphs.
for folder in ('TEST', 'TRAINING'):
    corpus_dir = os.path.join(RSTDT_ROOTDIR, folder)
    for rstfile in dg.util.find_files(corpus_dir, '*.dis'):
        # read_dis() is given the path, so no file handle is opened here
        dg.read_dis(rstfile)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-972a2126c2d0> in <module>()
      2     for rstfile in dg.util.find_files(os.path.join(RSTDT_ROOTDIR, folder), '*.dis'):
      3         with open(rstfile) as f:
----> 4             dg.read_dis(rstfile)

/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/readwrite/rst/dis.pyc in __init__(self, dis_filepath, name, namespace, tokenize, precedence)
     76         self.tokens = []
     77         self.rst_tree = self.disfile2tree(dis_filepath)
---> 78         self.parse_rst_tree(self.rst_tree)
     79 
     80     def disfile2tree(self, dis_filepath):

/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/readwrite/rst/dis.pyc in parse_rst_tree(self, rst_tree, indent)
     92             span, children = rst_tree[0], rst_tree[1:]
     93             for child in children:
---> 94                 self.parse_rst_tree(child, indent=indent+1)
     95 
     96         else: # tree_type in ('Nucleus', 'Satellite')

/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/readwrite/rst/dis.pyc in parse_rst_tree(self, rst_tree, indent)
    144 
    145                 for child in children:
--> 146                     self.parse_rst_tree(child, indent=indent+1)
    147 
    148     def get_child_types(self, children):

/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/readwrite/rst/dis.pyc in parse_rst_tree(self, rst_tree, indent)
    141                                   edge_type=EdgeTypes.dominance_relation)
    142                 else:
--> 143                     raise ValueError("Unexpected child combinations: {}\n".format(child_types))
    144 
    145                 for child in children:

ValueError: Unexpected child combinations: defaultdict(<type 'list'>, {'Satellite': [0], 'Nucleus': []})



In [ ]: