In [93]:
import os
from lxml import etree
from nltk.tree import ParentedTree
from rstviewer import embed_rs3_image, embed_rs3str_image
import discoursegraphs as dg
from discoursegraphs.readwrite.rst.rs3 import extract_relationtypes
from discoursegraphs import t
import IPython
In [94]:
# !ls *.rs3
In [95]:
# embed_rs3_image("foo-bar-only-segments.rs3")
In [96]:
# as the leaves are unconnected, we need to add a root node and edges
# maybe we should always connect unconnected nodes into a linear-sequence multinuc
t("linear-sequence", [
("N", "foo"),
("N", "bar")])
Out[96]:
In [97]:
# embed_rs3_image("foo-bar-circ-foo-to-bar.rs3")
In [98]:
t("circumstance", [
("S", "foo"),
("N", "bar")])
Out[98]:
In [99]:
# embed_rs3_image("foo-bar-elab-foo-to-bar.rs3")
In [100]:
t("elaboration", [
("S", "foo"),
("N", "bar")])
Out[100]:
In [101]:
# embed_rs3_image("foo-bar-circ-bar-to-foo.rs3")
In [102]:
t("circumstance", [
("N", "foo"),
("S", "bar")])
Out[102]:
In [103]:
# embed_rs3_image("foo-bar-elab-bar-to-foo.rs3")
In [104]:
t("elaboration", [
("N", "foo"),
("S", "bar")])
Out[104]:
In [105]:
# embed_rs3_image("foo-bar-foo-conj-bar.rs3")
In [106]:
t("conjunction", [
("N", "foo"),
("N", "bar")])
Out[106]:
In [107]:
# embed_rs3_image("foo-bar-foo-joint-bar.rs3")
In [108]:
t("joint", [
("N", "foo"),
("N", "bar")])
Out[108]:
In [109]:
# trees_with_three_elems = !ls eins*.rs3
# for rs3_file in trees_with_three_elems:
# print "embed_rs3_image('{}')".format(rs3_file)
In [110]:
# embed_rs3_image('eins-zwei-drei-only-segments.rs3')
In [111]:
# as the leaves are unconnected, we need to add a root node and edges
# maybe we should always connect unconnected nodes into a linear-sequence multinuc
t("linear-sequence", [("N", "eins"), ("N", "zwei"), ("N", "drei")])
Out[111]:
In [112]:
# embed_rs3_image('eins-zwei-drei-(circ-(circ-eins-from-zwei)-to-drei).rs3')
In [113]:
t("circumstance", [
("S", [
("circumstance", [
("N", "eins"),
("S", "zwei")])]),
("N", "drei")])
Out[113]:
In [114]:
# embed_rs3_image('eins-zwei-drei-(circ-(circ-eins-from-zwei)-from-drei).rs3')
In [115]:
t("circumstance", [
("N", [
("circumstance", [
("N", "eins"),
("S", "zwei")])]),
("S", "drei")])
Out[115]:
In [116]:
# embed_rs3_image('eins-zwei-drei-(circ-(circ-eins-to-zwei)-to-drei.rs3')
In [117]:
t("circumstance", [
("S", [
("circumstance", [
("S", "eins"),
("N", "zwei")])]),
("N", "drei")])
Out[117]:
In [118]:
# embed_rs3_image('eins-zwei-drei-(circ-(circ-eins-to-zwei)-from-drei).rs3')
In [119]:
t("circumstance", [
("N", [
("circumstance", [
("S", "eins"),
("N", "zwei")])]),
("S", "drei")])
Out[119]:
In [120]:
# embed_rs3_image('eins-zwei-drei-(circ-eins-to-(joint-zwei-and-drei).rs3')
In [121]:
t("circumstance", [
("S", "eins"),
("N", [
("joint", [
("N", "zwei"),
("N", "drei")])])])
Out[121]:
In [122]:
# embed_rs3_image('eins-zwei-drei-(circ-eins-from-(joint-zwei-and-drei).rs3')
In [123]:
t("circumstance", [
("N", "eins"),
("S", [
("joint", [
("N", "zwei"),
("N", "drei")])])])
Out[123]:
In [124]:
# embed_rs3_image('eins-zwei-drei-(circ-(joint-eins-and-zwei)-to-drei).rs3')
In [125]:
t("circumstance", [
("S", [
("joint", [
("N", "eins"),
("N", "zwei")
])
]),
("N", "drei")
])
Out[125]:
In [126]:
# embed_rs3_image('eins-zwei-drei-(circ-(joint-eins-and-zwei)-from-drei).rs3')
In [127]:
t("circumstance", [
("N", [
("joint", [
("N", "eins"),
("N", "zwei")
])
]),
("S", "drei")
])
Out[127]:
In [128]:
# embed_rs3_image('eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3')
In [129]:
# "elab-eins-from-(joint-zwei-and-drei)": "eins" is the nucleus and the
# joint of "zwei" and "drei" is the satellite that elaborates on it
# (same shape as the "circ-eins-from-(joint-zwei-and-drei)" tree).
t("elaboration", [
    ("N", "eins"),
    ("S", [
        ("joint", [
            ("N", "zwei"),
            ("N", "drei")
        ])
    ])
])
Out[129]:
In [130]:
# embed_rs3_image('eins-zwei-drei-(joint-eins-and-zwei-and-drei).rs3')
In [131]:
t("joint", [("N", "eins"), ("N", "zwei"), ("N", "drei")])
Out[131]:
In [ ]:
In [132]:
# Hand-built tree for 'eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3':
# "eins" is the nucleus, the joint of "zwei" and "drei" is its satellite.
example_tree = t("elaboration", [
    ("N", "eins"),
    ("S", [
        ("joint", [
            ("N", "zwei"),
            ("N", "drei")
        ])
    ])
])
In [133]:
example_tree.leaves()
Out[133]:
In [134]:
example_tree.pretty_print()
In [135]:
example_rs3 = 'eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3'
In [136]:
# embed_rs3_image(example_rs3)
In [137]:
# Expected tree for the "elab-eins-from-(joint-zwei-and-drei)" example:
# "eins" is the nucleus; the joint of "zwei" and "drei" elaborates on it.
t("elaboration", [
    ("N", "eins"),
    ("S", [
        ("joint", [
            ("N", "zwei"),
            ("N", "drei")
        ])
    ])
])
Out[137]:
In [138]:
rdg = dg.read_rs3(example_rs3)
In [139]:
# %load_ext gvmagic
In [140]:
# %dotstr dg.print_dot(rdg)
This is because the root node is a `<segment>` and not a `<group>`:
<body>
<segment id="1">eins</segment>
<segment id="2" parent="4" relname="joint"> zwei</segment>
<segment id="3" parent="4" relname="joint"> drei</segment>
<group id="4" type="multinuc" parent="1" relname="elaboration" />
</body>
In [141]:
# rdg_maz00001 = dg.corpora.pcc.get_document('maz-00001')
In [142]:
# dg.corpora.pcc.get_files_by_document_id('maz-00001')
In [143]:
# maz00001_rs3 = "/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-00001.rs3"
# rdg_maz00001 = dg.read_rs3(maz00001_rs3, tokenize=False)
In [144]:
# %dotstr dg.print_dot(rdg_maz00001)
In [145]:
# 'eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3'
rs3str1 = """<rst>
<header>
<relations>
<rel name="elaboration" type="rst" />
<rel name="joint" type="multinuc" />
</relations>
</header>
<body>
<segment id="1">eins</segment>
<segment id="2" parent="4" relname="joint"> zwei</segment>
<segment id="3" parent="4" relname="joint"> drei</segment>
<group id="4" type="multinuc" parent="1" relname="elaboration" />
</body>
</rst>"""
In [146]:
# embed_rs3str_image(rs3str1)
In [147]:
rs3etree1 = etree.fromstring(rs3str1)
extract_relationtypes(rs3etree1)
Out[147]:
In [148]:
from collections import defaultdict
def rs3_to_dicts(rs3_file):
    """Parse an RS3 file into a parent->children map and an attribute map.

    Parameters
    ----------
    rs3_file : str or file-like
        The RS3 (XML) file; handed to ``etree.parse`` unchanged.

    Returns
    -------
    children : defaultdict(list)
        Maps a parent ID to the list of its child IDs. All ``<segment>``
        children come first, followed by all ``<group>`` children, each
        in document order. Elements without a ``parent`` attribute are
        collected under the key ``None``.
    elements : defaultdict of defaultdict(str)
        Maps an element ID to its attributes. Segments get the keys
        ``parent``, ``relname``, ``segment_type`` (``'isolate'``,
        ``'nucleus'``, ``'satellite'`` or ``'multinuc'``), ``text`` and
        ``element_type`` (``'segment'``). Groups get ``parent``,
        ``relname``, ``group_type`` and ``element_type`` (``'group'``).
        Keys that were never set read as ``''``.

    Raises
    ------
    KeyError
        If a segment/group has no ``id`` attribute. Presumably also when
        a segment's ``relname`` is missing from the mapping returned by
        ``extract_relationtypes`` (assumes it is a plain dict -- TODO
        confirm).
    """
    rs3_etree = etree.parse(rs3_file)
    reltypes = extract_relationtypes(rs3_etree)
    elements = defaultdict(lambda: defaultdict(str))
    children = defaultdict(list)

    for elem in rs3_etree.iter('segment'):
        segment_id = elem.attrib['id']
        parent_id = elem.attrib.get('parent')
        relname = elem.attrib.get('relname')

        segment = elements[segment_id]
        segment['parent'] = parent_id
        segment['relname'] = relname

        if relname is None:
            # segment is not attached to anything
            segment['segment_type'] = 'isolate'
        elif relname == 'span':
            segment['segment_type'] = 'nucleus'
        elif reltypes[relname] == 'rst':
            segment['segment_type'] = 'satellite'
        else:  # reltypes[relname] == 'multinuc'
            segment['segment_type'] = 'multinuc'

        segment['text'] = elem.text
        segment['element_type'] = 'segment'
        children[parent_id].append(segment_id)

    for elem in rs3_etree.iter('group'):
        group_id = elem.attrib['id']
        parent_id = elem.attrib.get('parent')

        group = elements[group_id]
        group['parent'] = parent_id
        group['relname'] = elem.attrib.get('relname')
        group['group_type'] = elem.attrib.get('type')
        group['element_type'] = 'group'
        children[parent_id].append(group_id)

    return children, elements
In [149]:
def children_dict2tree(child_dict, elements_dict, start_node=None):
    """Recursively convert the output of ``rs3_to_dicts`` into a tree.

    Parameters
    ----------
    child_dict : dict
        Maps a node ID to the list of its child IDs (``None`` holds the
        nodes without a parent).
    elements_dict : dict
        Maps a node ID to a dict of its attributes; only ``'text'`` is
        read here.
    start_node : str or None
        ID of the node to start from; ``None`` starts at the top.

    Returns
    -------
    A tree built with ``t`` when ``start_node`` has no children, exactly
    one child, or is ``None``. For any other node with several children
    a plain list of the children's subtrees is returned instead.
    """
    children = child_dict[start_node]

    if not children:
        # Leaf: put each token of the EDU on its own line. The text is
        # None for an empty <segment/>, so fall back to an empty string.
        edu_text = elements_dict[start_node]['text'] or ''
        formatted_edu_text = '\n'.join(edu_text.split())
        return t(start_node, formatted_edu_text)

    if len(children) == 1:
        # Single child: the new node is labelled with the child's ID and
        # wraps whatever the recursion returns for that child.
        only_child = children[0]
        subtree = children_dict2tree(
            child_dict, elements_dict, start_node=only_child)
        return t(only_child, subtree)

    child_trees = [
        children_dict2tree(child_dict, elements_dict, start_node=child)
        for child in children]
    if start_node is None:
        return t("root", child_trees)
    return child_trees
In [150]:
# children_dict2tree(children, elements)
In [151]:
# Smallest file (in bytes) among those returned for the 'rst' layer.
# Comparing (size, path) tuples breaks size ties by path, exactly as
# sorting them and taking the first element would.
smallest_pcc_rs3_file = min(
    (os.path.getsize(rs3_file), rs3_file)
    for rs3_file in dg.corpora.pcc.get_files_by_layer('rst'))[1]
In [152]:
child_dict, elements_dict = rs3_to_dicts(smallest_pcc_rs3_file)
children_dict2tree(child_dict, elements_dict)
Out[152]:
In [153]:
embed_rs3_image(smallest_pcc_rs3_file)
In [ ]:
In [154]:
# smallest_pcc_rs3_file
In [155]:
child_dict
Out[155]:
In [156]:
elements_dict['1'] # is a segment -> no children
Out[156]:
In [157]:
elements_dict['20'] # is a span
Out[157]:
In [158]:
child_dict['20']
Out[158]:
In [159]:
elements_dict['19'] # is a span
Out[159]:
In [160]:
child_dict['19'] # has two children
Out[160]:
In [161]:
# work directly on children that are not spans. look at span's children (recursion?)
for child in child_dict['19']:
print child
print elements_dict[child], "\n"
child_relation = elements_dict[child]['relname']
if child_relation != 'span':
print t(child_relation, [(child, []), ('19', [])])
In [162]:
t('antithesis', [
('15', []),
('19', [])
])
Out[162]:
In [163]:
child_dict['18'] # 18 is a span, so we have to look at its children
Out[163]:
In [164]:
elements_dict['6'] # is a segment (-> no children);
Out[164]:
In [165]:
elements_dict['9'] # is a segment (-> no children);
Out[165]:
In [ ]:
In [166]:
t("root", [
("N (1)", "Naive PDS"),
("N (20)", [
("antithesis", [
("S (15)", [
("preparation", [
("S (11)", "Situation"),
("N (14)", [
("joint", [
("N (2)", "niemand"),
("N (13)", [
("elaboration", [
("N (3)", "Maßnahme"),
("S (12)", [
("concession", [
("S (4)", "Basis"),
("N (5)", "Ernst")
])
])
])
])
])
])
])
]),
("N (19)", [
("evaluation-s", [
("N (18)", [
("evidence", [
("N (6)", "Tagen"),
("S (17)", [
("condition", [
("S (7)", "P-Antrag"),
("N (16)", [
("disjunction", [
("N (8)", "grenzenlos"),
("N (10)", "oder nimmt")
])
])
])
])
])
]),
("S (9)", "abzuwarten")
])
])
])
])
])
Out[166]:
In [167]:
elements_dict['10']
Out[167]:
In [168]:
elements_dict['16']
Out[168]:
In [169]:
child_dict['16']
Out[169]:
In [170]:
def get_edu_label(element):
    """Return the nuclearity label of a segment element.

    ``element`` is an attribute dict whose ``'element_type'`` must be
    ``'segment'`` (checked with an assertion). The result is ``'N'`` if
    its ``'segment_type'`` is ``'nucleus'`` or ``'multinuc'`` and ``'S'``
    for every other segment type.
    """
    assert element['element_type'] == 'segment'
    is_nucleus = element['segment_type'] in ('nucleus', 'multinuc')
    return 'N' if is_nucleus else 'S'
def rt(rs3_file, start_node=None):
    """Build a tree (via ``t``) for one node of an RS3 file.

    The file is parsed with ``rs3_to_dicts`` and ``start_node`` is looked
    up in the result:

    * a node without children becomes ``t(<N or S>, <its text>)``;
    * a group of type ``'multinuc'`` becomes ``t('multinuc', ...)`` with
      one ``("N", text)`` entry per child whose ``segment_type`` is
      ``'multinuc'`` (all other children are left out);
    * any other group raises ``NotImplementedError``.

    A node that has children but is not a group fails an assertion.
    """
    child_dict, elements_dict = rs3_to_dicts(rs3_file)
    element = elements_dict[start_node]
    children = child_dict[start_node]

    if not children:
        return t(get_edu_label(element), element['text'])

    # from here on: len(children) > 0
    assert element['element_type'] == 'group'
    if element['group_type'] != 'multinuc':
        # element['group_type'] == 'span'
        raise NotImplementedError("element: {}".format(element))

    nuclei = [
        ("N", elements_dict[child_id]['text'])
        for child_id in children
        if elements_dict[child_id].get('segment_type') == 'multinuc']
    return t('multinuc', nuclei)
In [171]:
rt(smallest_pcc_rs3_file, start_node='16')
Out[171]:
In [172]:
child_dict['10']
Out[172]:
In [173]:
elements_dict['16']
Out[173]:
In [174]:
child_dict['16']
Out[174]:
In [175]:
elements_dict['7']
Out[175]:
In [176]:
elements_dict['8']
Out[176]:
In [177]:
elements_dict['10']
Out[177]:
In [178]:
e = '16'
[c for c in child_dict[e] if elements_dict[c].get('segment_type') == 'multinuc']
Out[178]:
In [179]:
z = t("foo", "bar")
In [180]:
z.__repr__()
Out[180]:
In [181]:
z.pretty_print()
In [182]:
def print_segments(rs3_file):
    """Print the ID of every segment in an RS3 file and display its tree.

    For each ``<segment>`` the ID is printed. Then:

    * a segment without a parent is displayed as ``t("N", text)``;
    * a segment attached via an ``'rst'`` relation is displayed as the
      satellite (``'S'``) of that relation, next to an ``'N'``
      placeholder that refers to the parent ID;
    * a segment attached via a ``'multinuc'`` relation is displayed the
      same way, but labelled ``'N'``;
    * a segment attached via ``span`` (or a relation name unknown to
      ``extract_relationtypes``) is a nucleus and is not displayed.

    Parameters
    ----------
    rs3_file : str or file-like
        The RS3 (XML) file; handed to ``etree.parse`` unchanged.
    """
    rs3_etree = etree.parse(rs3_file)
    reltypes = extract_relationtypes(rs3_etree)

    for elem in rs3_etree.iter('segment'):
        segment_id = elem.attrib['id']
        # single-argument form prints the same text under Python 2 and 3
        print("Segment ID:  {}".format(segment_id))

        parent_id = elem.attrib.get('parent')
        relname = elem.attrib.get('relname')
        # elem.text is None for an empty <segment/>
        edu_text = (elem.text or '').strip()

        if parent_id is None:
            IPython.core.display.display(t("N", edu_text))
            continue

        reltype = reltypes.get(relname, 'span')
        if reltype == 'rst':
            segment_label = 'S'
        elif reltype == 'multinuc':
            # every member of a multinuclear relation is a nucleus
            # (get_edu_label maps 'multinuc' to 'N' as well)
            segment_label = 'N'
        else:  # reltype == 'span'
            # segment is a nucleus; nothing is displayed for it (yet)
            continue

        segment_tree = t(relname, [
            (segment_label, edu_text),
            ('N', "... cf. {}".format(parent_id))])
        IPython.core.display.display(segment_tree)
In [183]:
print_segments(smallest_pcc_rs3_file)
In [184]:
from IPython.display import display
In [185]:
display(t("foo"))
In [186]:
from nltk.draw.tree import TreeView
In [187]:
print TreeView(t("foo"))
In [188]:
for x in ('a','b','c'):
IPython.core.display.display(t(x))
In [189]:
smallest_pcc_rs3_file
Out[189]:
In [190]:
class RSTTree(object):
    """Thin wrapper around the tree that ``t(root, children)`` builds.

    The wrapped tree is kept in ``self.tree``; PNG rendering and string
    conversion are forwarded to it.
    """

    def __init__(self, root, children=None):
        """Build the wrapped tree from ``root`` and ``children``."""
        self.tree = t(root, children)

    def _repr_png_(self):
        """Return whatever the wrapped tree's ``_repr_png_`` returns."""
        wrapped = self.tree
        return wrapped._repr_png_()

    def __str__(self):
        """Return the wrapped tree's own string form."""
        wrapped = self.tree
        return wrapped.__str__()
In [191]:
r = RSTTree("foo", "bar")
In [192]:
print r
In [193]:
r
Out[193]:
In [194]:
from collections import OrderedDict
In [195]:
e = OrderedDict()
In [196]:
e[1] = 5
In [197]:
e[5] = 10
In [198]:
e[2] = 23
In [199]:
e
Out[199]:
In [200]:
import textwrap
In [201]:
textwrap.wrap("sdfls ;kdfsdfl;s dl;sdf d; fd;f sdfk;dkf d; fks d ;skd", 5)
Out[201]:
In [202]:
class NoRootError(ValueError):
    """A ``ValueError`` subclass without extra behaviour.

    Presumably meant to signal a tree without a root node -- TODO
    confirm once it is raised by real code.
    """
In [203]:
# Demonstrate the exception without leaving an uncaught error in the
# notebook (an uncaught error stops "Restart & Run All").
try:
    raise NoRootError("Foo")
except NoRootError as err:
    print(repr(err))
In [ ]: