In [93]:
import os
from lxml import etree
from nltk.tree import ParentedTree
from rstviewer import embed_rs3_image, embed_rs3str_image
import discoursegraphs as dg
from discoursegraphs.readwrite.rst.rs3 import extract_relationtypes
from discoursegraphs import t

import IPython

In [94]:
# !ls *.rs3

In [95]:
# embed_rs3_image("foo-bar-only-segments.rs3")

In [96]:
# as the leaves are unconnected, we need to add a root node and edges
# maybe we should always connect unconnected nodes into a linear-sequence multinuc
t("linear-sequence", [
    ("N", "foo"),
    ("N", "bar")])


Out[96]:

In [97]:
# embed_rs3_image("foo-bar-circ-foo-to-bar.rs3")

In [98]:
t("circumstance", [
    ("S", "foo"),
    ("N", "bar")])


Out[98]:

In [99]:
# embed_rs3_image("foo-bar-elab-foo-to-bar.rs3")

In [100]:
t("elaboration", [
    ("S", "foo"),
    ("N", "bar")])


Out[100]:

In [101]:
# embed_rs3_image("foo-bar-circ-bar-to-foo.rs3")

In [102]:
t("circumstance", [
    ("N", "foo"),
    ("S", "bar")])


Out[102]:

In [103]:
# embed_rs3_image("foo-bar-elab-bar-to-foo.rs3")

In [104]:
t("elaboration", [
    ("N", "foo"),
    ("S", "bar")])


Out[104]:

In [105]:
# embed_rs3_image("foo-bar-foo-conj-bar.rs3")

In [106]:
t("conjunction", [
    ("N", "foo"),
    ("N", "bar")])


Out[106]:

In [107]:
# embed_rs3_image("foo-bar-foo-joint-bar.rs3")

In [108]:
t("joint", [
    ("N", "foo"),
    ("N", "bar")])


Out[108]:

In [109]:
# trees_with_three_elems = !ls eins*.rs3
# for rs3_file in trees_with_three_elems:
#     print "embed_rs3_image('{}')".format(rs3_file)

In [110]:
# embed_rs3_image('eins-zwei-drei-only-segments.rs3')

In [111]:
# as the leaves are unconnected, we need to add a root node and edges
# maybe we should always connect unconnected nodes into a linear-sequence multinuc
t("linear-sequence", [("N", "eins"), ("N", "zwei"), ("N", "drei")])


Out[111]:

In [112]:
# embed_rs3_image('eins-zwei-drei-(circ-(circ-eins-from-zwei)-to-drei).rs3')

In [113]:
t("circumstance", [
    ("S", [
        ("circumstance", [
            ("N", "eins"),
            ("S", "zwei")])]),
    ("N", "drei")])


Out[113]:

In [114]:
# embed_rs3_image('eins-zwei-drei-(circ-(circ-eins-from-zwei)-from-drei).rs3')

In [115]:
t("circumstance", [
    ("N", [
        ("circumstance", [
            ("N", "eins"),
            ("S", "zwei")])]),
    ("S", "drei")])


Out[115]:

In [116]:
# embed_rs3_image('eins-zwei-drei-(circ-(circ-eins-to-zwei)-to-drei.rs3')

In [117]:
t("circumstance", [
    ("S", [
        ("circumstance", [
            ("S", "eins"),
            ("N", "zwei")])]),
    ("N", "drei")])


Out[117]:

In [118]:
# embed_rs3_image('eins-zwei-drei-(circ-(circ-eins-to-zwei)-from-drei).rs3')

In [119]:
t("circumstance", [
    ("N", [
        ("circumstance", [
            ("S", "eins"),
            ("N", "zwei")])]),
    ("S", "drei")])


Out[119]:

In [120]:
# embed_rs3_image('eins-zwei-drei-(circ-eins-to-(joint-zwei-and-drei).rs3')

In [121]:
t("circumstance", [
    ("S", "eins"),
    ("N", [
        ("joint", [
            ("N", "zwei"),
            ("N", "drei")])])])


Out[121]:

In [122]:
# embed_rs3_image('eins-zwei-drei-(circ-eins-from-(joint-zwei-and-drei).rs3')

In [123]:
t("circumstance", [
    ("N", "eins"),
    ("S", [
        ("joint", [
            ("N", "zwei"),
            ("N", "drei")])])])


Out[123]:

In [124]:
# embed_rs3_image('eins-zwei-drei-(circ-(joint-eins-and-zwei)-to-drei).rs3')

In [125]:
t("circumstance", [
    ("S", [
        ("joint", [
            ("N", "eins"),
            ("N", "zwei")
        ])
    ]),
    ("N", "drei")
])


Out[125]:

In [126]:
# embed_rs3_image('eins-zwei-drei-(circ-(joint-eins-and-zwei)-from-drei).rs3')

In [127]:
t("circumstance", [
    ("N", [
        ("joint", [
            ("N", "eins"),
            ("N", "zwei")
        ])
    ]),
    ("S", "drei")
])


Out[127]:

In [128]:
# embed_rs3_image('eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3')

In [129]:
# 'elab-eins-from-(joint-zwei-and-drei)': "eins" is the nucleus and the
# multinuclear joint of "zwei" and "drei" is the satellite that elaborates it.
t("elaboration", [
    ("N", "eins"),
    ("S", [
        ("joint", [
            ("N", "zwei"),
            ("N", "drei")
        ])
    ])
])


Out[129]:

In [130]:
# embed_rs3_image('eins-zwei-drei-(joint-eins-and-zwei-and-drei).rs3')

In [131]:
t("joint", [("N", "eins"), ("N", "zwei"), ("N", "drei")])


Out[131]:

In [ ]:


In [132]:
example_tree = t("elaboration", [
    ("N", [
        ("joint", [
            ("N", "eins"),
            ("N", "zwei")
        ])
    ]),
    ("S", "drei")
])

In [133]:
example_tree.leaves()


Out[133]:
['eins', 'zwei', 'drei']

In [134]:
example_tree.pretty_print()


           elaboration     
        ________|_______    
       N                |  
       |                |   
     joint              |  
  _____|________        |   
 N              N       S  
 |              |       |   
eins           zwei    drei

TODO: does dg.readwrite.tree work with RSTGraph?


In [135]:
example_rs3 = 'eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3'

In [136]:
# embed_rs3_image(example_rs3)

In [137]:
# expected tree for example_rs3: segment "eins" is the nucleus, the joint group
# ("zwei", "drei") is attached to it as the satellite of the elaboration.
t("elaboration", [
    ("N", "eins"),
    ("S", [
        ("joint", [
            ("N", "zwei"),
            ("N", "drei")
        ])
    ])
])


Out[137]:

In [138]:
rdg = dg.read_rs3(example_rs3)

In [139]:
# %load_ext gvmagic

In [140]:
# %dotstr dg.print_dot(rdg)

TODO: why is the root node not connected?

Because the root node is a <segment> and not a <group>:

<body>
    <segment id="1">eins</segment>
    <segment id="2" parent="4" relname="joint"> zwei</segment>
    <segment id="3" parent="4" relname="joint"> drei</segment>
    <group id="4" type="multinuc" parent="1" relname="elaboration" />
</body>

In [141]:
# rdg_maz00001 = dg.corpora.pcc.get_document('maz-00001')

In [142]:
# dg.corpora.pcc.get_files_by_document_id('maz-00001')

In [143]:
# maz00001_rs3 = "/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-00001.rs3"
# rdg_maz00001 = dg.read_rs3(maz00001_rs3, tokenize=False)

In [144]:
# %dotstr dg.print_dot(rdg_maz00001)

In [145]:
# 'eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3'

rs3str1 = """<rst>
  <header>
    <relations>
      <rel name="elaboration" type="rst" />
      <rel name="joint" type="multinuc" />
    </relations>
  </header>
  <body>
    <segment id="1">eins</segment>
    <segment id="2" parent="4" relname="joint"> zwei</segment>
    <segment id="3" parent="4" relname="joint"> drei</segment>
    <group id="4" type="multinuc" parent="1" relname="elaboration" />
  </body>
</rst>"""

In [146]:
# embed_rs3str_image(rs3str1)

In [147]:
rs3etree1 = etree.fromstring(rs3str1)
extract_relationtypes(rs3etree1)


Out[147]:
{'elaboration': 'rst', 'joint': 'multinuc'}

In [148]:
from collections import defaultdict

def rs3_to_dicts(rs3_file):
    """Read an RS3 file into a parent->children map and an element-attribute map.

    Parameters
    ----------
    rs3_file
        Handed unchanged to ``etree.parse``.

    Returns
    -------
    children : defaultdict(list)
        Maps a parent ID to the list of IDs of its children. Elements without
        a ``parent`` attribute are collected under the key ``None``. All
        <segment> children are appended before any <group> children, because
        segments are processed first.
    elements : defaultdict of defaultdict(str)
        Maps an element ID to its attributes. Every element has 'parent',
        'relname' and 'element_type' ('segment' or 'group'). Segments
        additionally have 'text' and 'segment_type' ('isolate', 'nucleus',
        'satellite' or 'multinuc'); groups additionally have 'group_type'.

    Raises
    ------
    KeyError
        If an element has no ``id`` attribute. Presumably also if a segment's
        relname is missing from the extracted relation types (this assumes
        ``extract_relationtypes`` returns a plain dict -- TODO confirm).
    """
    rs3_etree = etree.parse(rs3_file)
    reltypes = extract_relationtypes(rs3_etree)
    elements = defaultdict(lambda: defaultdict(str))
    children = defaultdict(list)

    for elem in rs3_etree.iter('segment'):
        segment_id = elem.attrib['id']
        parent_id = elem.attrib.get('parent')
        relname = elem.attrib.get('relname')

        # Derive the segment's role from the relation that links it to its parent.
        if relname is None:
            segment_type = 'isolate'
        elif relname == 'span':
            segment_type = 'nucleus'
        elif reltypes[relname] == 'rst':
            segment_type = 'satellite'
        else:  # reltypes[relname] == 'multinuc'
            segment_type = 'multinuc'

        segment = elements[segment_id]
        segment['parent'] = parent_id
        segment['relname'] = relname
        segment['segment_type'] = segment_type
        segment['text'] = elem.text
        segment['element_type'] = 'segment'
        children[parent_id].append(segment_id)

    for elem in rs3_etree.iter('group'):
        group_id = elem.attrib['id']
        parent_id = elem.attrib.get('parent')

        group = elements[group_id]
        group['parent'] = parent_id
        group['relname'] = elem.attrib.get('relname')
        group['group_type'] = elem.attrib.get('type')
        group['element_type'] = 'group'
        children[parent_id].append(group_id)

    return children, elements

In [149]:
def children_dict2tree(child_dict, elements_dict, start_node=None):
    """Recursively convert a children map and an element map into a tree.

    Parameters
    ----------
    child_dict : dict
        Maps a node ID to the list of its child IDs. It is indexed with
        ``child_dict[start_node]``, so a defaultdict (as returned by
        ``rs3_to_dicts``) gains an empty entry for every leaf that is visited.
    elements_dict : dict
        Maps a node ID to its attribute dict; only 'text' is read here.
    start_node
        ID of the node to start from; ``None`` addresses the elements that
        have no parent.

    Returns
    -------
    A tree built with ``t`` for a leaf, for a node with exactly one child and
    for the root (``start_node is None``) with several children. For any other
    node with several children a plain *list* of the children's results is
    returned instead of a tree.
    """
    children = child_dict[start_node]

    if len(children) == 0:
        # A segment without any text is stored as None; treat it like an
        # empty string instead of failing on None.split().
        edu_text = elements_dict[start_node]['text'] or ''
        # put every whitespace-separated token on its own line
        formatted_edu_text = '\n'.join(edu_text.split())
        return t(start_node, formatted_edu_text)
    elif len(children) == 1:
        child = children[0]
        return t(child, children_dict2tree(child_dict, elements_dict, start_node=child))
    else:
        child_trees = [children_dict2tree(child_dict, elements_dict, start_node=child)
                       for child in children]
        if start_node is None:
            return t("root", child_trees)
        else:
            # NOTE: no node is created for start_node itself in this case
            return child_trees

In [150]:
# children_dict2tree(children, elements)

TODO: study the smallest PCC RS3 file that has these errors


In [151]:
# Pair every RST file with its size and take the minimum; there is no need to
# sort the whole listing. Files of equal size are ordered by their path.
smallest_pcc_rs3_file = min(
    (os.path.getsize(rs3_file), rs3_file)
    for rs3_file in dg.corpora.pcc.get_files_by_layer('rst'))[1]

In [152]:
child_dict, elements_dict = rs3_to_dicts(smallest_pcc_rs3_file)
children_dict2tree(child_dict, elements_dict)


Out[152]:

In [153]:
embed_rs3_image(smallest_pcc_rs3_file)



In [ ]:


In [154]:
# smallest_pcc_rs3_file

In [155]:
child_dict


Out[155]:
defaultdict(list,
            {None: ['1', '20'],
             '1': [],
             '10': [],
             '11': [],
             '12': ['5'],
             '13': ['3'],
             '14': ['11', '2', '13'],
             '15': ['14'],
             '16': ['7', '8', '10'],
             '17': ['16'],
             '18': ['6', '9'],
             '19': ['15', '18'],
             '2': [],
             '20': ['19'],
             '3': ['12'],
             '4': [],
             '5': ['4'],
             '6': ['17'],
             '7': [],
             '8': [],
             '9': []})

In [156]:
elements_dict['1'] # is a segment -> no children


Out[156]:
defaultdict(str,
            {'element_type': 'segment',
             'parent': None,
             'relname': None,
             'segment_type': 'isolate',
             'text': '\n\nNaive PDS'})

In [157]:
elements_dict['20'] # is a span


Out[157]:
defaultdict(str,
            {'element_type': 'group',
             'group_type': 'span',
             'parent': None,
             'relname': None})

In [158]:
child_dict['20']


Out[158]:
['19']

In [159]:
elements_dict['19'] # is a span


Out[159]:
defaultdict(str,
            {'element_type': 'group',
             'group_type': 'span',
             'parent': '20',
             'relname': 'span'})

In [160]:
child_dict['19'] # has two children


Out[160]:
['15', '18']

In [161]:
# work directly on children that are not spans. look at span's children (recursion?)
for child in child_dict['19']:
    # dump the child's ID and its attribute dict for inspection
    print child
    print elements_dict[child], "\n"
    
    child_relation = elements_dict[child]['relname']
    # Only children whose relname is not 'span' are sketched as a tree here;
    # the parent ID '19' is hard-coded to match the node being inspected.
    if child_relation != 'span':
        print t(child_relation, [(child, []), ('19', [])])


15
defaultdict(<type 'str'>, {'relname': 'antithesis', 'group_type': 'span', 'parent': '19', 'element_type': 'group'}) 

(antithesis (15 ) (19 ))
18
defaultdict(<type 'str'>, {'relname': 'span', 'group_type': 'span', 'parent': '19', 'element_type': 'group'}) 


In [162]:
t('antithesis', [
    ('15', []),
    ('19', [])
])


Out[162]:

In [163]:
child_dict['18'] # 18 is a span, so we have to look at its children


Out[163]:
['6', '9']

In [164]:
elements_dict['6'] # is a segment (-> no children);


Out[164]:
defaultdict(str,
            {'element_type': 'segment',
             'parent': '18',
             'relname': 'span',
             'segment_type': 'nucleus',
             'text': '  Die PDS stellt in diesen Tagen hingegen erneut unter Beweis , dass sie noch immer nicht im vereinten Deutschland angekommen ist .'})

In [165]:
elements_dict['9'] # is a segment (-> no children);


Out[165]:
defaultdict(str,
            {'element_type': 'segment',
             'parent': '18',
             'relname': 'evaluation-s',
             'segment_type': 'satellite',
             'text': u'  Es bleibt abzuwarten , ob das realit\xe4tsferne Herumlavieren der Partei bereits bei der Berlin-Wahl am 21. Oktober Folgen haben wird .'})

In [ ]:


In [166]:
t("root", [
    ("N (1)", "Naive PDS"),
    ("N (20)", [
        ("antithesis", [
            ("S (15)", [
                ("preparation", [
                    ("S (11)", "Situation"),
                    ("N (14)", [
                        ("joint", [
                            ("N (2)", "niemand"),
                            ("N (13)", [
                                ("elaboration", [
                                    ("N (3)", "Maßnahme"),
                                    ("S (12)", [
                                        ("concession", [
                                            ("S (4)", "Basis"),
                                            ("N (5)", "Ernst")
                                        ])
                                    ])
                                ])
                            ])
                        ])
                    ])
                ])
            ]),
            ("N (19)", [
                ("evaluation-s", [
                    ("N (18)", [
                        ("evidence", [
                            ("N (6)", "Tagen"),
                            ("S (17)", [
                                ("condition", [
                                    ("S (7)", "P-Antrag"),
                                    ("N (16)", [
                                        ("disjunction", [
                                            ("N (8)", "grenzenlos"),
                                            ("N (10)", "oder nimmt")
                                        ])
                                    ])
                                ])
                            ])
                        ])
                    ]),
                    ("S (9)", "abzuwarten")
                ])
            ])
        ])
    ])
])


Out[166]:

In [167]:
elements_dict['10']


Out[167]:
defaultdict(str,
            {'element_type': 'segment',
             'parent': '16',
             'relname': 'disjunction',
             'segment_type': 'multinuc',
             'text': ' oder nimmt sich selbst nicht ernst .'})

In [168]:
elements_dict['16']


Out[168]:
defaultdict(str,
            {'element_type': 'group',
             'group_type': 'multinuc',
             'parent': '17',
             'relname': 'span'})

In [169]:
child_dict['16']


Out[169]:
['7', '8', '10']

In [170]:
def get_edu_label(element):
    """Return the nuclearity label of a segment element.

    ``element`` must be a mapping whose 'element_type' is 'segment'
    (enforced with an assert). The label is 'N' when its 'segment_type' is
    'nucleus' or 'multinuc' and 'S' for every other value.

    NOTE(review): 'isolate' segments therefore get 'S' -- confirm this is intended.
    """
    assert element['element_type'] == 'segment'
    is_nuclear = element['segment_type'] in ('nucleus', 'multinuc')
    return 'N' if is_nuclear else 'S'

def rt(rs3_file, start_node=None):
    """Build a tree for one node of an RS3 file (work in progress).

    The file is parsed with ``rs3_to_dicts`` on every call.

    * A node without children becomes a one-leaf tree labelled by
      ``get_edu_label`` and holding the node's text.
    * A 'multinuc' group becomes a 'multinuc' tree with one "N" child per
      child segment whose segment_type is 'multinuc'; all other children
      of the group are left out.
    * Any other node with children raises NotImplementedError.
    """
    child_dict, elements_dict = rs3_to_dicts(rs3_file)

    element = elements_dict[start_node]
    children = child_dict[start_node]

    # leaf node
    if not children:
        return t(get_edu_label(element), element['text'])

    # inner nodes have to be groups
    assert element['element_type'] == 'group'
    if element['group_type'] != 'multinuc':
        # element['group_type'] == 'span'
        raise NotImplementedError("element: {}".format(element))

    nuclei = []
    for child_id in child_dict[start_node]:
        if elements_dict[child_id].get('segment_type') == 'multinuc':
            nuclei.append(("N", elements_dict[child_id]['text']))
    return t('multinuc', nuclei)

In [171]:
rt(smallest_pcc_rs3_file, start_node='16')


Out[171]:

In [172]:
child_dict['10']


Out[172]:
[]

In [173]:
elements_dict['16']


Out[173]:
defaultdict(str,
            {'element_type': 'group',
             'group_type': 'multinuc',
             'parent': '17',
             'relname': 'span'})

In [174]:
child_dict['16']


Out[174]:
['7', '8', '10']

In [175]:
elements_dict['7']


Out[175]:
defaultdict(str,
            {'element_type': 'segment',
             'parent': '16',
             'relname': 'condition',
             'segment_type': 'satellite',
             'text': u'  Wer in einem Parteitagsantrag feststellt , dass der international operierende Terrorismus allein mit " innerem Frieden und gesellschaftlichem Ausgleich " bek\xe4mpft werden kann ,'})

In [176]:
elements_dict['8']


Out[176]:
defaultdict(str,
            {'element_type': 'segment',
             'parent': '16',
             'relname': 'disjunction',
             'segment_type': 'multinuc',
             'text': ' ist entweder grenzenlos naiv'})

In [177]:
elements_dict['10']


Out[177]:
defaultdict(str,
            {'element_type': 'segment',
             'parent': '16',
             'relname': 'disjunction',
             'segment_type': 'multinuc',
             'text': ' oder nimmt sich selbst nicht ernst .'})

In [178]:
e = '16'

[c for c in child_dict[e] if elements_dict[c].get('segment_type') == 'multinuc']


Out[178]:
['8', '10']

In [179]:
z = t("foo", "bar")

In [180]:
z.__repr__()


Out[180]:
"ParentedTree('foo', ['bar'])"

In [181]:
z.pretty_print()


foo
 |  
bar


In [182]:
def print_segments(rs3_file):
    """Print the ID of every segment of an RS3 file and display small trees.

    For each <segment> element (in the order ``rs3_etree.iter`` yields them):

    * without a ``parent`` attribute: a one-leaf "N" tree is displayed,
    * in an 'rst' relation: the segment is shown as the satellite ("S")
      next to an "N" placeholder that refers to the parent ID,
    * in a 'multinuc' relation: the segment is shown as a nucleus ("N")
      next to an "N" placeholder that refers to the parent ID,
    * otherwise (relname 'span' or not among the relation types): the
      segment is a nucleus and nothing is displayed.

    Parameters
    ----------
    rs3_file
        Handed unchanged to ``etree.parse``.
    """
    # Local import: use IPython's public display entry point.
    from IPython.display import display

    rs3_etree = etree.parse(rs3_file)
    reltypes = extract_relationtypes(rs3_etree)

    for elem in rs3_etree.iter('segment'):
        segment_id = elem.attrib['id']
        # single-argument form prints the same text under Python 2 and 3
        print("Segment ID:  %s" % segment_id)
        parent_id = elem.attrib.get('parent')
        relname = elem.attrib.get('relname')
        # elem.text is None for a segment without text
        edu_text = (elem.text or '').strip()

        if parent_id is None:
            display(t("N", edu_text))
            continue

        reltype = reltypes.get(relname, 'span')
        parent_ref = "... cf. {}".format(parent_id)
        if reltype == 'rst':
            display(t(relname, [
                ('S', edu_text),
                ('N', parent_ref)]))
        elif reltype == 'multinuc':
            # every member of a multinuclear relation is a nucleus
            display(t(relname, [
                ('N', edu_text),
                ('N', parent_ref)]))
        # else: reltype == 'span' -> segment is a nucleus, nothing to display

In [183]:
print_segments(smallest_pcc_rs3_file)


Segment ID:  1
Segment ID:  11
Segment ID:  2
Segment ID:  3
Segment ID:  4
Segment ID:  5
Segment ID:  6
Segment ID:  7
Segment ID:  8
Segment ID:  10
Segment ID:  9

In [184]:
from IPython.display import display

In [185]:
display(t("foo"))



In [186]:
from nltk.draw.tree import TreeView

In [187]:
print TreeView(t("foo"))


<nltk.draw.tree.TreeView object at 0x7f6ed8266b50>

In [188]:
for x in ('a','b','c'):
    IPython.core.display.display(t(x))



In [189]:
smallest_pcc_rs3_file


Out[189]:
'/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-1818.rs3'

In [190]:
class RSTTree(object):
    """Thin wrapper around the tree that ``t(root, children)`` builds."""

    def __init__(self, root, children=None):
        """Create the wrapped tree from ``root`` and ``children`` and keep it in ``self.tree``."""
        self.tree = t(root, children)

    def _repr_png_(self):
        """Delegate PNG rendering to the wrapped tree."""
        wrapped = self.tree
        return wrapped._repr_png_()

    def __str__(self):
        """Return the string form of the wrapped tree."""
        return str(self.tree)

In [191]:
r = RSTTree("foo", "bar")

In [192]:
print r


(foo bar)

In [193]:
r


Out[193]:

In [194]:
from collections import OrderedDict

In [195]:
e = OrderedDict()

In [196]:
e[1] = 5

In [197]:
e[5] = 10

In [198]:
e[2] = 23

In [199]:
e


Out[199]:
OrderedDict([(1, 5), (5, 10), (2, 23)])

In [200]:
import textwrap

In [201]:
textwrap.wrap("sdfls ;kdfsdfl;s dl;sdf d; fd;f sdfk;dkf d; fks d ;skd", 5)


Out[201]:
['sdfls',
 ';kdfs',
 'dfl;s',
 'dl;sd',
 'f d;',
 'fd;f ',
 'sdfk;',
 'dkf',
 'd;',
 'fks d',
 ';skd']

In [202]:
class NoRootError(ValueError):
    """ValueError subclass; judging by its name, meant for input that has no root node (TODO confirm)."""

In [203]:
raise NoRootError("Foo")


---------------------------------------------------------------------------
NoRootError                               Traceback (most recent call last)
<ipython-input-203-436a723ec324> in <module>()
----> 1 raise NoRootError("Foo")

NoRootError: Foo

In [ ]: