In [1]:
leaves = ['_!blah', 'blah', 'blah_!']
In [2]:
# Remove the '_!' EDU delimiters from the first and last token.
# NOTE: str.lstrip('_!') / str.rstrip('_!') treat their argument as a *set of
# characters*, so they would also eat leading/trailing '_' or '!' characters
# that belong to the text itself (e.g. '_!_foo' -> 'foo', 'wow!_!' -> 'wow').
# Slice off exactly one marker instead.
if leaves[0].startswith('_!'):
    leaves[0] = leaves[0][len('_!'):]
if leaves[-1].endswith('_!'):
    leaves[-1] = leaves[-1][:-len('_!')]
In [3]:
leaves
Out[3]:
In [4]:
import os
import discoursegraphs as dg
from discoursegraphs.readwrite.tree import t, is_leaf
In [5]:
# !ls *.rs3
In [6]:
t1 = dg.read_rs3tree('foo-bar-only-segments.rs3')
t1
Out[6]:
In [7]:
t1.pprint()
In [ ]:
In [8]:
t2 = dg.read_rs3tree('eins-zwei-drei-(circ-(circ-eins-from-zwei)-from-drei).rs3')
t2
Out[8]:
In [9]:
t2.pprint()
In [10]:
# Build the path to 'rst-example1.dis' below discoursegraphs' DATA_ROOT_DIR.
t3_filepath = os.path.join(dg.DATA_ROOT_DIR, 'rst-example1.dis')
# Read the file with dg.read_distree; the cells below access the result's
# `.disfile_tree` and `.tree` attributes.
t3 = dg.read_distree(t3_filepath)
# Bare expression: shown as this cell's output.
t3
Out[10]:
In [11]:
t3.disfile_tree
Out[11]:
In [12]:
t3.disfile_tree.label()
Out[12]:
In [13]:
t3.disfile_tree[0]
Out[13]:
In [14]:
t3.disfile_tree[1]
Out[14]:
In [15]:
t3.disfile_tree[2]
Out[15]:
In [16]:
t3.disfile_tree.parent()
In [17]:
type(t3)
Out[17]:
In [18]:
type(t3.disfile_tree)
Out[18]:
In [19]:
from nltk.tree import Tree
In [20]:
x1 = Tree('foo', [])
x1
Out[20]:
In [21]:
def isroot(parented_tree):
    """Tell whether the given node is the root of its tree.

    A node counts as root if it offers a ``parent`` attribute and calling
    it yields ``None``. Objects without a ``parent`` attribute (e.g. plain
    string leaves) are never considered roots.
    """
    if not hasattr(parented_tree, 'parent'):
        return False
    return parented_tree.parent() is None
In [22]:
t3
Out[22]:
In [23]:
isroot(t('foo', ['bar'])[0])
Out[23]:
In [24]:
# Number the leaf positions of the whole tree, counting from 1.
leaf_positions = t3.tree.treepositions('leaves')
for i, treepos in enumerate(leaf_positions, start=1):
    print(i, treepos)
In [25]:
t3.tree[1]
Out[25]:
In [26]:
t3.tree[1].treepositions('leaves')
Out[26]:
In [27]:
t3.tree[1].root().treepositions('leaves')
Out[27]:
In [28]:
t3.tree[(1,0,0)]
Out[28]:
In [29]:
x = t3.tree[(1,0,0)]
x.treeposition()
Out[29]:
In [30]:
x.treepositions('leaves')
Out[30]:
In [31]:
(1,0,0) + (0,1,0)
Out[31]:
In [32]:
# for rel_leaf_pos in x.treepositions('leaves'):
# print "rel pos: {}, abs pos: {}".format(rel_leaf_pos, x[rel_leaf_pos].treeposition())
In [33]:
# Leaf positions of the complete tree, reached from the subtree via root().
all_leaves = t3.tree[1].root().treepositions('leaves')
# Leaf positions as reported by the subtree t3.tree[1] itself
# (presumably relative to that subtree -- compare the two outputs below).
subtree_leaves = t3.tree[1].treepositions('leaves')
In [34]:
subtree_leaves
Out[34]:
In [35]:
all_leaves
Out[35]:
In [ ]:
In [36]:
def subtree_leaf_positions(subtree):
    """Return the leaf positions of ``subtree``, each prefixed with the
    subtree's own tree position.

    The positions reported by ``subtree.treepositions('leaves')`` are
    concatenated onto ``subtree.treeposition()``.
    """
    leaf_paths = subtree.treepositions('leaves')
    anchor = subtree.treeposition()
    return [anchor + path for path in leaf_paths]
def all_leaf_positions(parented_tree):
    """Return the leaf positions of the complete tree that
    ``parented_tree`` belongs to.

    The lookup always starts at ``parented_tree.root()``, so passing a
    subtree gives the same result as passing the whole tree.
    """
    whole_tree = parented_tree.root()
    return whole_tree.treepositions('leaves')
In [37]:
def make_span(parented_tree):
    """Build the 'span' or 'leaf' descriptor subtree used in
    dis/lisp/RST-DT-formatted trees.

    Examples:

    span (a subtree that covers the leaves 1 to 7)

      ___|____
     1        7

    leaf (a subtree that only covers leaf 7)

         |
         7

    The root always yields a span from 1 to the total number of leaves.
    For any other node, the 1-based ids are the ranks of its first and
    last leaf among all leaves of the complete tree.

    Raises NotImplementedError if a non-root subtree has no leaves.
    """
    every_leaf = all_leaf_positions(parented_tree)
    if isroot(parented_tree):
        return t('span', ['1', str(len(every_leaf))])

    own_leaves = subtree_leaf_positions(parented_tree)
    if not own_leaves:
        raise NotImplementedError('Subtree has no leaves')

    first_edu_id = every_leaf.index(own_leaves[0]) + 1
    if len(own_leaves) == 1:
        # a single EDU is described by a 'leaf' node instead of a span
        return t('leaf', [str(first_edu_id)])

    last_edu_id = every_leaf.index(own_leaves[-1]) + 1
    return t('span', [str(first_edu_id), str(last_edu_id)])
In [38]:
make_span(t3.tree).pretty_print()
In [39]:
make_span(t3.tree[1][0][1]).pretty_print()
In [40]:
make_span(t3.tree[1][0])
Out[40]:
In [41]:
t3.tree[1][0]
Out[41]:
In [42]:
left = t3.tree[(0,0,0)]
print(left.treeposition())
left
Out[42]:
In [43]:
right = t3.tree[(0,0,1)]
print(right.treeposition())
right
Out[43]:
In [44]:
right.treeposition()
Out[44]:
In [45]:
left.treeposition()
Out[45]:
In [46]:
left.root()[right.treeposition()]
Out[46]:
In [47]:
# Show the tree position of every direct child of the root.
# print() with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
for child in t3.tree:
    print(child.treeposition())
In [48]:
def get_siblings(parented_subtree):
    """Return the tree positions of all siblings of the given subtree.

    Parameters
    ----------
    parented_subtree
        a node offering ``parent()`` and ``treeposition()``
        (e.g. an nltk ParentedTree)

    Returns
    -------
    list of tuple
        absolute tree positions of every other child of the subtree's
        parent, in child order; an empty list if the subtree is the root.

    Sibling positions are derived from the parent's position and the child
    index instead of calling ``treeposition()`` on each child. Plain leaves
    (strings) have no ``treeposition()`` method, so asking the children
    directly would fail as soon as one sibling is a leaf.
    """
    parent = parented_subtree.parent()
    if parent is None:
        return []

    subtree_pos = parented_subtree.treeposition()
    # the last element of a tree position is the index within the parent
    parent_pos, own_index = subtree_pos[:-1], subtree_pos[-1]
    return [parent_pos + (index,)
            for index in range(len(parent))
            if index != own_index]
In [49]:
x = t3.tree[0,0,0]
print(get_siblings(x))
print(x)
In [50]:
def make_rel2par(nuc_or_sat_subtree):
    """Build the 'rel2par' descriptor subtree for a nucleus or satellite.

    - A satellite ('S') gets the label of its parent node.
    - A nucleus ('N') whose only sibling is a satellite gets 'span'.
    - A nucleus whose siblings are all nuclei (or that has no siblings)
      gets the label of its parent node.

    Raises ValueError if the node is the root, if its label is neither
    'N' nor 'S', or if a nucleus has any other combination of siblings.
    """
    if isroot(nuc_or_sat_subtree):
        raise ValueError("Root node can't have a relation.")

    nuclearity = nuc_or_sat_subtree.label()
    # presumably the parent node carries the relation name
    relation = nuc_or_sat_subtree.parent().label()

    if nuclearity == 'S':
        return t('rel2par', [relation])
    if nuclearity != 'N':
        raise ValueError(
            "Unknown nuclearity. Expected 'N' or 'S', got: {}".format(nuclearity))

    sibling_positions = get_siblings(nuc_or_sat_subtree)
    whole_tree = nuc_or_sat_subtree.root()
    sibling_labels = [whole_tree[pos].label() for pos in sibling_positions]

    if len(sibling_labels) == 1 and sibling_labels[0] == 'S':
        return t('rel2par', ['span'])
    if all(sib_label == 'N' for sib_label in sibling_labels):
        return t('rel2par', [relation])
    raise ValueError(
        "Can't mix sibling types. Expected 'N' or 'S', got: {}".format(sibling_labels))
In [51]:
x = t3.tree[(1,0,1)]
x.pretty_print()
make_rel2par(x)
Out[51]:
In [52]:
t3.tree
Out[52]:
In [53]:
def make_edu(edu_string):
    """Wrap the text of an EDU into a 'text' subtree.

    The string is split on whitespace; the first token is prefixed and the
    last token is suffixed with the '_!' delimiter (a single token gets
    both). Raises IndexError if the string contains no tokens.
    """
    words = edu_string.split()
    words[0] = u'_!' + words[0]
    words[-1] += u'_!'
    return t('text', words)
In [54]:
def get_nucsat_subtrees(parented_tree):
    """Collect the nearest nucleus ('N'), satellite ('S') and leaf nodes
    below the given tree.

    Children with any other label are not returned themselves; they are
    descended into recursively. A leaf passed in directly is returned as
    a one-element list.
    """
    if is_leaf(parented_tree):
        return [parented_tree]

    collected = []
    for node in parented_tree:
        if not is_leaf(node) and node.label() not in ('N', 'S'):
            # neither leaf nor N/S: look for N/S/leaf nodes further down
            collected += get_nucsat_subtrees(node)
        else:
            collected.append(node)
    return collected
In [55]:
from IPython.display import Image, display
# `(0)` is just the integer 0, so this is the first child of the root.
first_subtree = t3.tree[0]
display(first_subtree)
for child in get_nucsat_subtrees(first_subtree):
    display(child)
In [56]:
# def get_unparented_subtrees(parented_tree):
# unparented_subtrees = []
# for subtree in parented_tree:
# subtree._parent = None
# unparented_subtrees.append(subtree)
# return unparented_subtrees
def orphanize(parented_subtree):
    """Detach a subtree from its parent by resetting its ``_parent``
    attribute to None, and return it.

    Leaves are returned untouched. The subtree is modified in place.
    """
    if not is_leaf(parented_subtree):
        parented_subtree._parent = None
    return parented_subtree
def convert_label(label):
    """Expand the short nuclearity labels: 'N' becomes 'Nucleus' and 'S'
    becomes 'Satellite'. Any other label is returned unchanged.
    """
    for short_form, long_form in (('N', 'Nucleus'), ('S', 'Satellite')):
        if label == short_form:
            return long_form
    return label
def convert(parented_tree):
    """Recursively convert a tree with 'N'/'S' nodes into a tree with
    'Root' / 'Nucleus' / 'Satellite' nodes.

    - A leaf becomes a 'text' subtree (see make_edu).
    - The root becomes a 'Root' node whose first child is its span
      description.
    - Any other node keeps its (expanded) label and starts with a span
      description followed by a rel2par description.

    In both non-leaf cases the converted nucleus/satellite/leaf
    descendants (see get_nucsat_subtrees) are appended afterwards.
    """
    at_root = isroot(parented_tree)
    if not at_root and is_leaf(parented_tree):
        return make_edu(parented_tree)

    converted = [make_span(parented_tree)]
    if not at_root:
        # only non-root nodes stand in a relation to their parent
        converted.append(make_rel2par(parented_tree))

    converted.extend(
        convert(subtree) for subtree in get_nucsat_subtrees(parented_tree))

    if at_root:
        new_label = 'Root'
    else:
        new_label = convert_label(parented_tree.label())
    return t(new_label, [orphanize(node) for node in converted])
In [57]:
convert(t3.tree)
Out[57]:
In [58]:
test1 = t3.tree[(0,0,0,0,0,0)]
test1
Out[58]:
In [59]:
convert(test1) # OKAY
Out[59]:
In [60]:
test2 = t3.tree[(0,0,0,0,0)]
test2
Out[60]:
In [61]:
convert(test2) # OKAY
Out[61]:
In [62]:
test3 = t3.tree[(0,0,0,0,1,0)]
test3
Out[62]:
In [63]:
# convert(test3) # FAIL: ValueError: Unknown nuclearity. Expected 'N' or 'S', got: consequence-n
In [64]:
test4 = t3.tree[(0,0,0,0,1)]
test4
Out[64]:
In [65]:
convert(test4) # OKAY
Out[65]:
In [66]:
test5 = t3.tree[(0,0,0)]
test5
Out[66]:
In [67]:
ctest5 = convert(test5)
ctest5
Out[67]:
In [68]:
# Print the converted tree as text. print() with a single argument behaves
# the same on Python 2 and 3; the bare `print x` statement is Python 2 only.
print(convert(t3.tree))
In [69]:
input_tree = dg.read_distree(os.path.join(dg.DATA_ROOT_DIR, 'rst-example1.dis'))
In [70]:
input_tree.disfile_tree
Out[70]:
In [71]:
input_tree
Out[71]:
In [72]:
import glob
In [73]:
# Directory holding the RST-DT training trees (*.dis files).
# Set the RSTDT_DIR environment variable to point at a local copy of the
# corpus; the previous hard-coded location is kept as the fallback.
RSTDT_DIR = os.environ.get(
    'RSTDT_DIR',
    '/home/arne/corpora/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/TRAINING/')
# glob.glob() returns matches in arbitrary order; sort them so that the
# files are processed in the same order on every run.
dis_files = sorted(glob.glob(os.path.join(RSTDT_DIR, '*.dis')))
In [74]:
# Try to parse every *.dis file and report the ones that fail.
# Catching Exception broadly is deliberate here: one broken file must not
# stop the survey of the remaining files.
for dis_file in dis_files:
    try:
        dg.read_distree(dis_file)
    except Exception as error:
        file_name = os.path.basename(dis_file)
        print(file_name, error)
In [75]:
wsj_1115 = dg.read_distree(os.path.join(RSTDT_DIR, 'wsj_1115.out.dis'))
wsj_1115
Out[75]:
In [76]:
dg.write_dis(wsj_1115)
In [ ]: