*.dis
files, the *.lisp.name
and *.step.name
may be broken, too. The RST Discourse Treebank contains 385 WSJ articles from PTB with Rhetorical Structure Theory (RST) annotations.
The following information was taken from the RST-DT documentation:
This directory contains 385 Wall Street Journal articles, broken into TRAINING (347 documents) and TEST (38 documents) sub-directories.
Filenames are in one of two forms:
The 5 files named file# were identified as the following filenames in Treebank-2:
(More information is available in a compressed file via ftp, which provides the relationship between the 2,499 PTB filenames and the corresponding WSJ DOCNO strings in TIPSTER.)
A directory with three files:
<docno>.lisp.name
- discourse structure created by a human judge for a text.
<docno>.step.name
- list of all human actions taken.
## -- a file with an integer as its name - temp file.
All annotations were produced using a discourse annotation tool that can be downloaded from http://www.isi.edu/~marcu/discourse.
The files in the .rst directories are provided only to enable interested users to visualize and print in a convenient format the discourse annotations in the corpus.
<docno>
.step
and .lisp
IMPORTANT NOTE: The .lisp files may contain errors introduced by the discourse annotation tool. Please use the .lisp and .step files only for visualizing the trees.
Use the .dis files for training/testing purposes (the mapping program that produced the .dis file was written so as to eliminate the errors introduced by the annotation tool).
<docno>.edus
- edus (elementary discourse units) listed line by line.
This directory contains the same types of files as the subdirectory RSTtrees-WSJ-main-1.0, for 53 documents which were reviewed by a second analyst.
In [8]:
import os
import sys
import glob
import nltk
# Root of the main RST-DT release (TRAINING + TEST *.dis files).
RSTDT_MAIN_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0')
# Root of the doubly-annotated subset (53 documents reviewed by a second analyst).
RSTDT_DOUBLE_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-double-1.0')
# Root of a tokenized copy of the main release -- TODO confirm how it was produced.
RSTDT_TOKENIZED_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0-tokenized')
# Single *.dis files used for quick interactive experiments below.
RSTDT_TEST_FILE = os.path.join(RSTDT_MAIN_ROOT, 'TEST', 'wsj_1306.out.dis')
RSTDT_TOKENIZED_TEST_FILE = os.path.join(RSTDT_TOKENIZED_ROOT, 'TEST', 'wsj_1306.out.dis')
# Root of the Penn Treebank WSJ *.mrg parse trees (used for graph merging below).
PTB_WSJ_ROOT_DIR = os.path.expanduser('~/corpora/pennTreebank/parsed/mrg/wsj')
In [3]:
# *.dis files that nltk's BracketParseCorpusReader cannot parse.
# At least wsj_2353 and wsj_2367 contain '//TT_ERR' markers (see the
# ack-grep output further down in this file).
# Idiom fix: use a set literal instead of set([...]).
FILES_UNPARSABLE_WITH_NLTK = {
    '/home/arne/corpora/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/TRAINING/wsj_1107.out.dis',
    '/home/arne/corpora/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/TRAINING/wsj_2353.out.dis',
    '/home/arne/corpora/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/TRAINING/wsj_2367.out.dis',
}
In [4]:
def get_nodelabel(node):
    """Return the label of an nltk tree node.

    Subtrees report their grammar label; unicode leaves come back as
    UTF-8 encoded byte strings (this is Python 2 code). Any other type
    is rejected with a ValueError.
    """
    if isinstance(node, nltk.tree.Tree):
        return node.label()
    if isinstance(node, unicode):
        # leaf token: encode the unicode string to bytes
        return node.encode('utf-8')
    raise ValueError("Unexpected node type: {}, {}".format(type(node), node))
In [5]:
from nltk.corpus.reader import BracketParseCorpusReader
def parse_rstfile_nltk(rst_filepath):
    """Read a *.dis RST file and return it as an nltk.tree.Tree."""
    directory, basename = os.path.split(rst_filepath)
    reader = BracketParseCorpusReader(directory, [basename])
    # a *.dis file holds exactly one tree, so the first parsed
    # sentence is the whole document
    return reader.parsed_sents()[0]
In [6]:
from collections import defaultdict
def nested_tree_count(tree, result_dict=None):
    """Recursively count the productions of Nucleus/Satellite subtrees.

    Walks an nltk-parsed RST tree and, for every Nucleus or Satellite
    subtree, increments a count for the tuple of its children's labels
    (the "right-hand side" of the production).

    Parameters
    ----------
    tree : nltk.tree.Tree
        (sub)tree of a parsed *.dis file
    result_dict : defaultdict or None
        accumulator mapping node label -> {rhs tuple -> count};
        created on the first call and shared by the recursion

    Returns
    -------
    defaultdict
        the (possibly caller-supplied) accumulator with the counts.

    Raises
    ------
    ValueError
        if a 'leaf' production does not have exactly the expected
        (leaf, rel2par, text) shape, which indicates a badly escaped
        s-expression in the source file.
    """
    # Bug fix: 'if not result_dict' would silently replace a
    # caller-supplied (still empty) accumulator; test identity instead.
    if result_dict is None:
        result_dict = defaultdict(lambda: defaultdict(int))
    for subtree in tree:  # enumerate() index was unused, dropped
        if isinstance(subtree, nltk.tree.Tree) and subtree.label() in ('Nucleus', 'Satellite'):
            rhs = tuple([get_nodelabel(st) for st in subtree])
            result_dict[get_nodelabel(subtree)][rhs] += 1
            if rhs[0] == u'leaf' and len(rhs) != 3:  # (leaf, rel2par, text)
                raise ValueError('Badly escaped s-expression\n{}\n'.format(subtree))
            nested_tree_count(subtree, result_dict)
    # Bug fix: the accumulated counts were previously never returned.
    return result_dict
( Satellite (span 22 28) (rel2par elaboration-set-member-e)
( Nucleus (span 22 23) (rel2par span)
( Nucleus (leaf 22) (rel2par span) (text _!Canadian Imperial Bank of Commerce_!) )
( Satellite (leaf 23) (rel2par elaboration-additional) (text _!(Canada) --_!) )
)
In [6]:
# BADLY_ESCAPED_FILES = set()
# for folder in ('TEST', 'TRAINING'):
# for rst_fpath in glob.glob(os.path.join(RSTDT_MAIN_ROOT, folder, '*.dis')):
# if rst_fpath not in FILES_UNPARSABLE_WITH_NLTK:
# rst_tree = parse_rstfile_nltk(rst_fpath)
# try:
# nested_tree_count(rst_tree)
# except ValueError as e:
# BADLY_ESCAPED_FILES.add(rst_fpath)
# len(BADLY_ESCAPED_FILES) # 22 files
In [7]:
import sys
import traceback
import sexpdata
def parse_rstfile_sexpdata(rst_filepath):
    """Parse a *.dis RST file with sexpdata.

    Returns the nested list/Symbol structure produced by
    ``sexpdata.load``. All of sexpdata's known failure modes are
    normalized into a ValueError that names the offending file.

    Bug fix: the error messages previously referenced the global
    ``rst_fpath`` (a leftover loop variable) instead of this
    function's ``rst_filepath`` parameter, which would raise a
    NameError -- or worse, report the wrong file.
    """
    with open(rst_filepath) as rstfile:
        try:
            return sexpdata.load(rstfile)
        except sexpdata.ExpectClosingBracket as e:
            raise ValueError(u"{}\n{}\n\n".format(rst_filepath, e))
        except sexpdata.ExpectNothing as e:
            # truncate: the complete message would contain the whole document
            raise ValueError(u"{}\n{}...\n\n".format(rst_filepath, e.args[0][:100]))
        except (AssertionError, AttributeError):
            # both are raised by sexpdata on malformed input; report the traceback
            raise ValueError(u"{}\n{}\n\n".format(rst_filepath, traceback.format_exc()))
In [8]:
# FILES_UNPARSABLE_WITH_SEXPDATA = set()
# for folder in ('TEST', 'TRAINING'):
# for rst_fpath in glob.glob(os.path.join(RSTDT_MAIN_ROOT, folder, '*.dis')):
# try:
# parse_rstfile_sexpdata(rst_fpath)
# except ValueError as e:
# FILES_UNPARSABLE_WITH_SEXPDATA.add(rst_fpath)
# len(FILES_UNPARSABLE_WITH_SEXPDATA) # 113 unparsable files
In [9]:
# ALL_UNPARSABLE_FILES = FILES_UNPARSABLE_WITH_NLTK.union(FILES_UNPARSABLE_WITH_SEXPDATA).union(BADLY_ESCAPED_FILES)
# len(ALL_UNPARSABLE_FILES) # 124 unparsable files
In [10]:
sexp_tree = parse_rstfile_sexpdata(RSTDT_TEST_FILE)
# a list that contains Symbol instances (and lists of Symbol instances and integers)
In [11]:
root = sexp_tree[0]
In [12]:
print sexp_tree[1]
print sexp_tree[1][0]
print sexp_tree[1][0].value()
In [13]:
nuc_tree = sexp_tree[2]
print nuc_tree[1][0].value()
print nuc_tree[1][1], nuc_tree[1][2]
for i, e in enumerate(nuc_tree):
print i, e, '\n'
Some *.dis files contain "//TT_ERR" strings, in both the main and the RSTtrees-WSJ-main-1.0-tokenized directory:

arne@ziegelstein ~/repos/rst_discourse_treebank $ ack-grep -cl "//TT_ERR"
data/RSTtrees-WSJ-main-1.0/TRAINING/wsj_2367.out.dis:102
data/RSTtrees-WSJ-main-1.0/TRAINING/wsj_2353.out.dis:53
data/RSTtrees-WSJ-main-1.0-tokenized/TRAINING/wsj_2367.out.dis:102
data/RSTtrees-WSJ-main-1.0-tokenized/TRAINING/wsj_2353.out.dis:53
In [14]:
import discoursegraphs as dg
from collections import Counter
class RSTLispDocumentGraph(dg.DiscourseDocumentGraph):
    """
    A directed graph with multiple edges (based on a networkx
    MultiDiGraph) that represents the rhetorical structure of a
    document.

    Attributes
    ----------
    name : str
        name, ID of the document or file name of the input file
    ns : str
        the namespace of the document (default: rst)
    root : str
        name of the document root node ID
    tokens : list of str
        sorted list of all token node IDs contained in this document graph
    """

    def __init__(self, dis_filepath, name=None, namespace='rst',
                 tokenize=True, precedence=False):
        """
        Creates an RSTLispDocumentGraph from a Rhetorical Structure *.dis file and adds metadata
        to it.

        Parameters
        ----------
        dis_filepath : str
            absolute or relative path to the Rhetorical Structure *.dis file to be
            parsed.
        name : str or None
            the name or ID of the graph to be generated. If no name is
            given, the basename of the input file is used.
        namespace : str
            the namespace of the document (default: rst)
        tokenize : bool
            If True, each EDU text is whitespace-split and one token
            node per token is added to the graph.
        precedence : bool
            If True, add precedence relation edges
            (root precedes token1, which precedes token2 etc.)
            NOTE(review): this flag is currently never read in the body.
        """
        # super calls __init__() of base class DiscourseDocumentGraph
        super(RSTLispDocumentGraph, self).__init__()
        self.name = name if name else os.path.basename(dis_filepath)
        self.ns = namespace
        self.root = 0
        self.add_node(self.root, layers={self.ns}, label=self.ns+':root_node')
        # drop the generic root node the base class may have added;
        # this graph uses the integer 0 as its root node instead
        if 'discoursegraph:root_node' in self:
            self.remove_node('discoursegraph:root_node')
        self.tokenized = tokenize
        self.tokens = []
        self.rst_tree = parse_rstfile_sexpdata(dis_filepath)
        self.parse_rst_tree(self.rst_tree)

    def parse_rst_tree(self, rst_tree, indent=0):
        """Recursively convert a parsed *.dis s-expression into graph nodes and edges.

        ``rst_tree`` is the nested list/Symbol structure returned by
        parse_rstfile_sexpdata; ``indent`` only tracks recursion depth.
        """
        tree_type = self.get_tree_type(rst_tree)
        assert tree_type in ('Root', 'Nucleus', 'Satellite')
        if tree_type == 'Root':
            # element layout: (Root (span a b) child1 child2 ...)
            # NOTE(review): no edge is added from self.root to the top-level
            # children here -- confirm whether that is intentional.
            span, children = rst_tree[1], rst_tree[2:]
            for child in children:
                self.parse_rst_tree(child, indent=indent+1)
        else:  # tree_type in ('Nucleus', 'Satellite')
            node_id = self.get_node_id(rst_tree)
            node_type = self.get_node_type(rst_tree)
            relation_type = self.get_relation_type(rst_tree)
            if node_type == 'leaf':
                # a leaf is an EDU; its string lives in the (text ...) element
                edu_text = self.get_edu_text(rst_tree[3])
                self.add_node(node_id, attr_dict={self.ns+':text': edu_text,
                    'label': u'{}: {}'.format(node_id, edu_text[:20])})
                if self.tokenized:
                    # naive whitespace tokenization of the EDU text
                    edu_tokens = edu_text.split()
                    for i, token in enumerate(edu_tokens):
                        token_node_id = '{}_{}'.format(node_id, i)
                        self.tokens.append(token_node_id)
                        self.add_node(token_node_id, attr_dict={self.ns+':token': token,
                            'label': token})
                        # connect the EDU node to each of its token nodes
                        self.add_edge(node_id, '{}_{}'.format(node_id, i))
            else:  # node_type == 'span'
                self.add_node(node_id, attr_dict={self.ns+':rel_type': relation_type,
                    self.ns+':node_type': node_type})
                # children follow the (span ...) and (rel2par ...) elements
                children = rst_tree[3:]
                child_types = self.get_child_types(children)
                expected_child_types = set(['Nucleus', 'Satellite'])
                unexpected_child_types = set(child_types).difference(expected_child_types)
                assert not unexpected_child_types, \
                    "Node '{}' contains unexpected child types: {}\n".format(node_id, unexpected_child_types)
                if 'Satellite' not in child_types:
                    # span only contains nucleii -> multinuc
                    for child in children:
                        child_node_id = self.get_node_id(child)
                        self.add_edge(node_id, child_node_id, attr_dict={self.ns+':rel_type': relation_type})
                elif len(child_types['Satellite']) == 1 and len(child_types['Nucleus']) == 1:
                    # standard RST relation, where one satellite is dominated by one nucleus
                    nucleus_index = child_types['Nucleus'][0]
                    satellite_index = child_types['Satellite'][0]
                    nucleus_node_id = self.get_node_id(children[nucleus_index])
                    satellite_node_id = self.get_node_id(children[satellite_index])
                    # span node -> nucleus (spanning), nucleus -> satellite (dominance)
                    self.add_edge(node_id, nucleus_node_id, attr_dict={self.ns+':rel_type': 'span'},
                        edge_type=dg.EdgeTypes.spanning_relation)
                    self.add_edge(nucleus_node_id, satellite_node_id,
                        attr_dict={self.ns+':rel_type': relation_type},
                        edge_type=dg.EdgeTypes.dominance_relation)
                else:
                    # e.g. several satellites under one span -- not handled here
                    raise ValueError("Unexpected child combinations: {}\n".format(child_types))
                for child in children:
                    self.parse_rst_tree(child, indent=indent+1)

    def get_child_types(self, children):
        """
        maps from (sub)tree type (i.e. Nucleus or Satellite) to a list
        of all children of this type
        """
        child_types = defaultdict(list)
        for i, child in enumerate(children):
            child_types[self.get_tree_type(child)].append(i)
        return child_types

    def get_edu_text(self, text_subtree):
        """Return the text of a (text ...) element as a unicode string."""
        assert text_subtree[0].value() == 'text'
        # sexpdata wraps some words in Symbol objects, while others
        # stay plain strings; both are decoded from UTF-8 (Python 2)
        return u' '.join(word.value().decode('utf-8')
            if isinstance(word, sexpdata.Symbol) else word.decode('utf-8')
            for word in text_subtree[1:])

    def get_tree_type(self, tree):
        """returns the type of the (sub)tree: Root, Nucleus or Satellite"""
        tree_type = tree[0].value()
        return tree_type

    def get_node_type(self, tree):
        """returns the node type (leaf or span) of a subtree (i.e. Nucleus or Satellite)"""
        node_type = tree[1][0].value()
        assert node_type in ('leaf', 'span')
        return node_type

    def get_relation_type(self, tree):
        """returns the RST relation type attached to the parent node of an RST relation"""
        return tree[2][1].value()

    def get_node_id(self, nuc_or_sat):
        """Generate a namespaced node ID from a (leaf N) or (span N M) element."""
        node_type = self.get_node_type(nuc_or_sat)
        if node_type == 'leaf':
            leaf_id = nuc_or_sat[1][1]
            return '{}:{}'.format(self.ns, leaf_id)
        else:  # node_type == 'span'
            span_start = nuc_or_sat[1][1]
            span_end = nuc_or_sat[1][2]
            return '{}:span:{}-{}'.format(self.ns, span_start, span_end)
In [15]:
RSTDT_TOKENIZED_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0-tokenized')
import traceback
# for folder in ('TEST', 'TRAINING'):
# for rst_fpath in glob.glob(os.path.join(RSTDT_TOKENIZED_ROOT, folder, '*.dis')):
# try:
# RSTLispDocumentGraph(rst_fpath)
# # print rst_fpath
# except ValueError as e:
# sys.stderr.write("Error in file '{}'\n{}\n".format(rst_fpath, e))
In [16]:
# TODO: error in attachment: rst:span:18-20 -> 18-19
rdg = RSTLispDocumentGraph(RSTDT_TOKENIZED_TEST_FILE, tokenize=False)
In [17]:
# %load_ext gvmagic
# %dotstr dg.print_dot(rdg)
In [18]:
RSTDT_NLTK_TOKENIZED_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0-nltk-tokenized')
dis_file = os.path.join(RSTDT_NLTK_TOKENIZED_ROOT, 'TEST/wsj_2386.out.dis')
mrg_file = os.path.join(PTB_WSJ_ROOT_DIR, '23/wsj_2386.mrg')
rdg = RSTLispDocumentGraph(dis_file)
pdg = dg.read_ptb(mrg_file)
for t in rdg.tokens[:10]: print t,
print
for t in pdg.tokens[:10]: print t,
In [19]:
print dis_file
rdg.merge_graphs(pdg, verbose=True)
In [20]:
import re
import glob
import sys
# Map a WSJ filename to its PTB subdirectory (first two digits of the
# document ID) and to its full four-digit document ID.
# Fix: raw strings for the regex patterns (non-raw '\d' is an invalid
# escape sequence and a SyntaxWarning in newer Pythons).
WSJ_SUBDIR_REGEX = re.compile(r'wsj_(\d{2})')
WSJ_DOCID_REGEX = re.compile(r'wsj_(\d{4})')

# Try to merge every RST document graph with its corresponding PTB
# parse-tree graph; report failures per file on stderr.
for folder in ('TEST', 'TRAINING'):
    for rst_fpath in glob.glob(os.path.join(RSTDT_NLTK_TOKENIZED_ROOT, folder, '*.dis')):
        # (a dead 'doc_id = basename.split(...)' assignment was removed;
        # doc_id is derived from the regex match below)
        try:
            rdg = RSTLispDocumentGraph(rst_fpath)
            rst_fname = os.path.basename(rst_fpath).lower()
            doc_id = WSJ_DOCID_REGEX.match(rst_fname).groups()[0]
            wsj_subdir = WSJ_SUBDIR_REGEX.match(rst_fname).groups()[0]
            ptb_file = os.path.join(PTB_WSJ_ROOT_DIR, wsj_subdir, 'wsj_{}.mrg'.format(doc_id))
            pdg = dg.read_ptb(ptb_file)
            try:
                rdg.merge_graphs(pdg)
                print("merged: {}\n".format(rst_fpath))
            except Exception as e:
                sys.stderr.write("Error in {}: {}\n".format(rst_fpath, e))
        except Exception as e:
            sys.stderr.write("Error in {}: {}\n".format(rst_fpath, e))
In [20]:
In [21]:
os.path.basename(RSTDT_TEST_FILE)
Out[21]:
In [22]:
PTB_TEST_FILE = os.path.expanduser('~/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1306.mrg')
In [23]:
sent0_root = pdg.sentences[0]
ptb_1306_tokens = list(pdg.get_tokens(token_strings_only=True))
In [10]:
RSTDT_TEST_FILE
Out[10]:
In [9]:
rst_tree = parse_rstfile_nltk(RSTDT_TEST_FILE)
span_tree = rst_tree[0]
print span_tree, span_tree.productions(), span_tree.leaves()
In [21]:
print rst_tree[1][1]
In [16]:
# print open(RSTDT_TEST_FILE).read()
In [25]: