`convert_rst_discourse_tb`: in `discourseparsing.convert_rst_discourse_tb:main`, their code includes:
with open(path_dis) as f:
rst_tree_str = f.read().strip()
rst_tree_str = fix_rst_treebank_tree_str(rst_tree_str)
rst_tree_str = convert_parens_in_rst_tree_str(rst_tree_str)
rst_tree = ParentedTree.fromstring(rst_tree_str)
reformat_rst_tree(rst_tree)
# file mapping from the RSTDTB documentation
file_mapping = {'file1.edus': 'wsj_0764.out.edus',
'file2.edus': 'wsj_0430.out.edus',
'file3.edus': 'wsj_0766.out.edus',
'file4.edus': 'wsj_0778.out.edus',
'file5.edus': 'wsj_2172.out.edus'}
edu = re.sub(r'>\s*', r'', edu).replace('&amp;', '&')
edu = re.sub(r'---', r'--', edu)
edu = edu.replace('. . .', '...')
...
if path_basename == 'file1.edus':
edu = edu.replace('founded by',
'founded by his grandfather.')
elif (path_basename == 'wsj_0660.out.edus'
or path_basename == 'wsj_1368.out.edus'
or path_basename == "wsj_1371.out.edus"):
edu = edu.replace('S.p. A.', 'S.p.A.')
elif path_basename == 'wsj_1329.out.edus':
edu = edu.replace('G.m.b. H.', 'G.m.b.H.')
...
In [1]:
import os
import re
from nltk.tree import ParentedTree
import sexpdata
import discoursegraphs as dg
# Root directory of a local checkout of the RST Discourse Treebank data
# (the path is expanded relative to the current user's home directory).
RSTDT_ROOTDIR = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0')
# One *.dis file from the TEST folder; it is parsed further down in this file.
RSTDT_TEST_FILE = os.path.join(RSTDT_ROOTDIR, 'TEST', 'wsj_1306.out.dis')
In [2]:
# Maps each bracket character to the Penn Treebank style token
# (e.g. -LRB-) that replaces it inside EDU text.
_ptb_paren_mapping = {'(': r'-LRB-',
                      ')': r'-RRB-',
                      '[': r'-LSB-',
                      ']': r'-RSB-',
                      '{': r'-LCB-',
                      '}': r'-RCB-'}
In [3]:
def fix_rst_treebank_tree_str(rst_tree_str):
    '''
    Strip the stray ``//TT_ERR`` annotation that directly follows a closing
    parenthesis, keeping the parenthesis itself.

    The annotation is reported to occur in two files that cannot be parsed
    otherwise:

    - data/RSTtrees-WSJ-main-1.0/TRAINING/wsj_2353.out.dis
    - data/RSTtrees-WSJ-main-1.0/TRAINING/wsj_2367.out.dis

    Parameters
    ----------
    rst_tree_str : str
        the content of a ``*.dis`` file

    Returns
    -------
    str
        the input with every ``)//TT_ERR`` replaced by ``)``
    '''
    error_marker = re.compile(r'\)//TT_ERR')
    return error_marker.sub(')', rst_tree_str)
In [4]:
def convert_parens_in_rst_tree_str(rst_tree_str):
    '''
    Replace brackets and parentheses inside the ``_!...!_``-delimited EDU
    text of an RST discourse treebank string with Penn Treebank style
    tokens (e.g., -LRB-), so that the NLTK tree API doesn't crash when
    trying to read in the RST trees.

    One substitution pass is run per bracket type in `_ptb_paren_mapping`.

    NOTE(review): ``[^_(?=!)]`` is a plain character class, i.e. it excludes
    the literal characters ``_ ( ? = ! )``; it is not a lookahead. The
    pattern is kept exactly as it was.

    Parameters
    ----------
    rst_tree_str : str
        the content of a ``*.dis`` file

    Returns
    -------
    str
        the string after all substitution passes
    '''
    converted = rst_tree_str
    for bracket, ptb_token in _ptb_paren_mapping.items():
        # The bracket is escaped with a backslash so that the regex engine
        # treats it literally.
        pattern = '(_![^_(?=!)]*)\\{}([^_(?=!)]*_!)'.format(bracket)
        substitution = '\\g<1>{}\\g<2>'.format(ptb_token)
        converted = re.sub(pattern, substitution, converted)
    return converted
In [5]:
def disfile2tree(dis_filepath):
    """
    Read a ``*.dis`` file and parse its content into a ParentedTree.

    The file content is stripped of surrounding whitespace, cleaned with
    `fix_rst_treebank_tree_str` and `convert_parens_in_rst_tree_str`
    (in this order) and then handed to ``ParentedTree.fromstring``.

    Parameters
    ----------
    dis_filepath : str
        path to the ``*.dis`` file

    Returns
    -------
    nltk.tree.ParentedTree
        the tree parsed from the cleaned file content
    """
    with open(dis_filepath) as dis_file:
        raw_tree_str = dis_file.read().strip()
    cleaned_tree_str = convert_parens_in_rst_tree_str(
        fix_rst_treebank_tree_str(raw_tree_str))
    return ParentedTree.fromstring(cleaned_tree_str)
In [6]:
# Parse the test file once; the cells below explore the resulting tree.
tree = disfile2tree(RSTDT_TEST_FILE)
In [7]:
# Node labels that mark a (sub)tree of the rhetorical structure.
SUBTREE_TYPES = ('Root', 'Nucleus', 'Satellite')
# Node labels accepted as the "node type" of a subtree (see get_node_type).
NODE_TYPES = ('leaf', 'span')


def is_tree(tree):
    """
    Tell whether the given element is a (sub)tree, i.e. whether its label
    is one of `SUBTREE_TYPES` (and not e.g. a leaf or span node).

    Parameters
    ----------
    tree : nltk.tree.ParentedTree
        a tree representing a rhetorical structure (or a part of it)

    Returns
    -------
    bool
        True, iff the element's label is 'Root', 'Nucleus' or 'Satellite'
    """
    label = tree.label()
    return label in SUBTREE_TYPES
In [8]:
def get_node_type(tree):
    """
    Return the node type (leaf or span) of a subtree (i.e. Nucleus or
    Satellite), which is the label of the subtree's first child.

    Parameters
    ----------
    tree : nltk.tree.ParentedTree
        a tree representing a rhetorical structure (or a part of it)

    Returns
    -------
    str
        'leaf' or 'span'

    Raises
    ------
    AssertionError
        if the first child's label is neither 'leaf' nor 'span'
    """
    first_child = tree[0]
    label = first_child.label()
    assert label in ('leaf', 'span')
    return label
In [9]:
class Mock(object):
    """Stand-in for the ``self`` argument of the functions below; it only
    provides the namespace attribute ``ns``."""
    ns = 'rst'


def get_node_id(self, nuc_or_sat):
    """
    Build the node ID of a subtree from the leaves of its first child.

    Parameters
    ----------
    self : object
        any object with an ``ns`` (namespace) attribute, e.g. a `Mock`
    nuc_or_sat : nltk.tree.ParentedTree
        a subtree whose first child is a leaf or span node

    Returns
    -------
    str
        ``<ns>:<leaf>`` for a leaf node, ``<ns>:span:<first>-<second>``
        for a span node (using the first two leaves of the span node)
    """
    node_type = get_node_type(nuc_or_sat)
    id_parts = nuc_or_sat[0].leaves()
    if node_type == 'leaf':
        return '{}:{}'.format(self.ns, id_parts[0])
    # get_node_type() only lets 'leaf' and 'span' through, so this is a span.
    return '{}:span:{}-{}'.format(self.ns, id_parts[0], id_parts[1])
In [10]:
# Print tree-ness, node type and node ID for the parsed tree itself and
# for each of its direct children that is a (sub)tree.
mock = Mock()
print is_tree(tree), get_node_type(tree), get_node_id(mock, tree)
for subelem in tree:
    if is_tree(subelem):
        print is_tree(subelem), get_node_type(subelem), get_node_id(mock, subelem)
In [11]:
def get_tree_type(tree):
    """
    Return the type of the (sub)tree: Root, Nucleus or Satellite.

    Raises an AssertionError if the tree's label is not one of
    `SUBTREE_TYPES`.
    """
    label = tree.label()
    assert label in SUBTREE_TYPES
    return label
In [ ]:
In [12]:
def get_relation_type(self, tree):
    """
    Return the RST relation type attached to the parent node of an RST
    relation, e.g. `span`, `elaboration` or `antithesis`.

    The value is the first child of the tree's second child.
    `self` is not used.
    """
    relation_node = tree[1]
    return relation_node[0]
In [13]:
def treeprint(tree, tab=0):
for elem in tree:
if is_tree(elem):
print "{}{} {}".format(' '*tab,
get_node_id(mock, elem),
get_relation_type(mock, elem))
treeprint(elem, tab=tab+1)
In [14]:
# Take the second child of the parsed tree and print its subtree type.
nuc = tree[1]
print get_tree_type(nuc)
In [15]:
# Second child of `nuc`; get_relation_type() reads the relation from this node.
reltype = nuc[1]
In [16]:
# Inspect the Python type of the relation node's first child.
print type(reltype[0])
In [17]:
# Print node IDs and relation types for all (sub)trees of the parsed tree.
treeprint(tree)
In [18]:
# First child of the tree's second child (the node get_node_type() looks at).
span = tree[1][0]
In [19]:
# Show the leaves of that node; get_node_id() builds the ID from them.
span.leaves()
Out[19]:
In [20]:
# Smoke test: run dg.read_dis() on every *.dis file in the TEST and TRAINING
# folders. read_dis() is given the file path, so the files are not opened
# here (the previous `with open(rstfile) as f:` handle was never used).
for folder in ('TEST', 'TRAINING'):
    folder_path = os.path.join(RSTDT_ROOTDIR, folder)
    for rstfile in dg.util.find_files(folder_path, '*.dis'):
        dg.read_dis(rstfile)
In [ ]: