In [61]:
import os
from lxml import etree
from nltk.tree import ParentedTree
from rstviewer import embed_rs3_image, embed_rs3str_image
import discoursegraphs as dg
from discoursegraphs.readwrite.rst.rs3 import extract_relationtypes, RSTTree
from discoursegraphs.readwrite.rst.rs3.rs3tree import n, s, debug_root_label
from discoursegraphs import t

import IPython

In [41]:
# Directory with the original .rs3 files of the Potsdam Commentary Corpus (PCC) 2.0.0.
PCC_RS3_DIR = os.path.join(dg.DATA_ROOT_DIR, 'potsdam-commentary-corpus-2.0.0', 'rst')
# Directory with the rs3tree example files below the package's data root.
RS3TREE_DIR = os.path.join(dg.DATA_ROOT_DIR, 'rs3tree')
# Target directory for generated excerpt files (see create_excert_file).
# NOTE(review): hardcoded absolute path to one developer's repo checkout --
# this will not exist on other machines; confirm whether it should be configurable.
REPO_RS3TREE_DIR = "/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree"

In [42]:
def rstviewer_vs_rsttree(rs3tree_example_filename, rs3tree_dir=RS3TREE_DIR,
                         debug=False, word_wrap=0):
    """Show one rs3 file twice: via rstviewer and as an RSTTree.

    Parameters
    ----------
    rs3tree_example_filename : str
        file name of the rs3 file (without directory)
    rs3tree_dir : str
        directory that contains the file (default: RS3TREE_DIR)
    debug : bool
        passed through to the RSTTree constructor
    word_wrap : int
        passed through to the RSTTree constructor

    Returns
    -------
    RSTTree
        the tree that was built from the file (it is also displayed)
    """
    example_path = os.path.join(rs3tree_dir, rs3tree_example_filename)

    # first output: the file as handled by rstviewer's embed_rs3_image()
    embed_rs3_image(example_path)

    # second output: the same file parsed into an RSTTree
    parsed_tree = RSTTree(example_path, debug=debug, word_wrap=word_wrap)
    IPython.core.display.display(parsed_tree)
    return parsed_tree

def print_example(rs3tree_example_filename, rs3tree_dir=RS3TREE_DIR):
    """Print the complete text content of an rs3 file.

    Parameters
    ----------
    rs3tree_example_filename : str
        file name of the rs3 file (without directory)
    rs3tree_dir : str
        directory that contains the file (default: RS3TREE_DIR)
    """
    example_path = os.path.join(rs3tree_dir, rs3tree_example_filename)
    with open(example_path, 'r') as example_file:
        file_content = example_file.read()
    print(file_content)

TODO: study broken PCC examples


In [43]:
def create_excert_file(pccrs3_input_filename):
    """Write an "excerpt" copy of a PCC rs3 file with numbered segments.

    The file is read from PCC_RS3_DIR, the text of every <segment>
    element is replaced by its 1-based running number, and the result is
    written to REPO_RS3TREE_DIR as ``<doc_id>-excerpt<ext>``
    (e.g. 'maz-11279.rs3' -> 'maz-11279-excerpt.rs3').
    The path of the written file is printed.

    Parameters
    ----------
    pccrs3_input_filename : str
        file name (without directory) of an rs3 file in PCC_RS3_DIR
    """
    rs3_filepath = os.path.join(PCC_RS3_DIR, pccrs3_input_filename)
    tree = etree.parse(rs3_filepath)

    # replace each segment's text with its position in document order
    for i, segment in enumerate(tree.iter('segment'), 1):
        segment.text = str(i)

    # splitext() keeps working for names with several (or no) dots, where
    # unpacking the result of split('.') would raise a ValueError
    doc_id, ext = os.path.splitext(pccrs3_input_filename)
    output_filename = "{}-excerpt{}".format(doc_id, ext)
    output_filepath = os.path.join(REPO_RS3TREE_DIR, output_filename)
    output_str = etree.tostring(tree, pretty_print=True,
                   xml_declaration=True, encoding='UTF-8')

    # tostring() with an explicit encoding returns a byte string, so the
    # file must be opened in binary mode (text mode fails on Python 3)
    with open(output_filepath, 'wb') as out_file:
        out_file.write(output_str)
    print(output_filepath)

In [98]:
# earlier selection, kept for reference:
# broken_test_files = ['maz-11279.rs3', 'maz-6918.rs3', 'maz-00001.rs3', 'maz-14654.rs3']
# PCC files for which numbered excerpt files are generated.
broken_test_files = ['maz-12666.rs3', 'maz-14813.rs3', 'maz-11279.rs3', 'maz-14654.rs3', 'maz-4472.rs3']
for broken_test_file in broken_test_files:
    # writes '<doc_id>-excerpt.rs3' to REPO_RS3TREE_DIR and prints its path
    create_excert_file(broken_test_file)


/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree/maz-12666-excerpt.rs3
/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree/maz-14813-excerpt.rs3
/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree/maz-11279-excerpt.rs3
/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree/maz-14654-excerpt.rs3
/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree/maz-4472-excerpt.rs3

TODO: write sanity tests for PCC

There must be no span nodes in an RSTTree.


In [ ]:
def no_span_nodes(tree, debug=False, root_id=None):
    """Return True, iff there is no span node in the given ParentedTree.

    The whole tree is searched: the root label and the labels of all
    subtrees at any depth are compared against the span label.

    Parameters
    ----------
    tree : ParentedTree
        the tree to check
    debug : bool
        passed to debug_root_label() to build the label to search for
    root_id : str or None
        passed to debug_root_label(); if None, ``tree.root_id`` is used

    Returns
    -------
    bool
        False if the tree or any of its subtrees carries the span label,
        True otherwise
    """
    assert isinstance(tree, ParentedTree)

    if root_id is None:
        # NOTE(review): a plain nltk ParentedTree does not seem to provide
        # a `root_id` attribute -- presumably callers pass trees that do,
        # or pass root_id explicitly; confirm.
        root_id = tree.root_id
    span_label = debug_root_label('span', debug=debug, root_id=root_id)

    if tree.label() == span_label:
        return False

    for node in tree:
        # Leaves (plain strings) can't be span nodes. For subtrees, only
        # stop early on a negative result; returning the recursive result
        # unconditionally would skip all remaining siblings.
        if isinstance(node, ParentedTree) and \
                not no_span_nodes(node, debug=debug, root_id=root_id):
            return False
    return True

In [96]:
# Example tree built with t(): a 'joint' of two nuclei, the second of which
# contains a 'background' relation (satellite 'bar', nucleus 'baz').
t('joint', [
        ('N', ['foo']),
        ('N', [
            ('background', [
                ('S', ['bar']),
                ('N', ['baz'])])]),
        ]
    )


Out[96]:

In [ ]:
# scratch cell (not executed): t() called with an empty label and no children
t('')

In [99]:
# maz-11279: show the numbered excerpt first, then the original PCC file.
# Both runs log "Segment '7' ... is a non-root nucleus without children".
produced = rstviewer_vs_rsttree(
    'maz-11279-excerpt.rs3', rs3tree_dir=RS3TREE_DIR, word_wrap=10, debug=False)

# note: `produced` is overwritten; only the tree of the full file is kept
produced = rstviewer_vs_rsttree(
    'maz-11279.rs3', rs3tree_dir=PCC_RS3_DIR, word_wrap=10, debug=True)


WARNING:root:Segment '7' in file 'maz-11279-excerpt.rs3' is a non-root nucleus without children
WARNING:root:Segment '7' in file 'maz-11279.rs3' is a non-root nucleus without children

In [102]:
# Known failure: raises TooManyChildrenError for multinuc group 28
# (non-multinuc children '25', '30', '31'), see the traceback in the output.
produced = rstviewer_vs_rsttree('maz-14654.rs3', rs3tree_dir=PCC_RS3_DIR)


---------------------------------------------------------------------------
TooManyChildrenError                      Traceback (most recent call last)
<ipython-input-102-60f0c4fe079e> in <module>()
----> 1 produced = rstviewer_vs_rsttree('maz-14654.rs3', rs3tree_dir=PCC_RS3_DIR)

<ipython-input-42-9491464ce88a> in rstviewer_vs_rsttree(rs3tree_example_filename, rs3tree_dir, debug, word_wrap)
      3     rs3_filepath = os.path.join(rs3tree_dir, rs3tree_example_filename)
      4     embed_rs3_image(rs3_filepath)
----> 5     rst_tree = RSTTree(rs3_filepath, word_wrap=word_wrap, debug=debug)
      6     IPython.core.display.display(rst_tree)
      7     return rst_tree

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in __init__(self, rs3_file, word_wrap, debug)
     47         self.edu_strings = [self.elem_dict[edu_id]['text']
     48                             for edu_id in self.edus]
---> 49         self.tree = self.dt()
     50 
     51     def _repr_png_(self):

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in dt(self, start_node)
     85         """
     86         if start_node is None:
---> 87             return self.root2tree(start_node=start_node)
     88 
     89         elem_id = start_node

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in root2tree(self, start_node)
    120             root_subtrees = [self.dt(start_node=root_id)
    121                              for root_id in root_nodes]
--> 122             sorted_subtrees = self.sort_subtrees(*root_subtrees)
    123             # ensure that each subtree is marked as a nucleus
    124             nuc_subtrees = [n_wrap(st, debug=self.debug) for st in sorted_subtrees]

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in dt(self, start_node)
    102         else:
    103             return self.group2tree(
--> 104                 elem_id, elem, elem_type, start_node=start_node)
    105 
    106     def root2tree(self, start_node=None):

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in group2tree(self, elem_id, elem, elem_type, start_node)
    213                     child_id = self.child_dict[elem_id][0]
    214                     return self.dt(start_node=child_id)
--> 215 
    216                 elif len(self.child_dict[elem_id]) == 2:
    217                     # this elem is the N of an N-S relation (child: S), but is also

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in dt(self, start_node)
    102         else:
    103             return self.group2tree(
--> 104                 elem_id, elem, elem_type, start_node=start_node)
    105 
    106     def root2tree(self, start_node=None):

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in group2tree(self, elem_id, elem, elem_type, start_node)
    205                         "Can't parse a multinuc group (%s) with more than 2 non-multinuc children: %s" \
    206                             % (elem_id, other_child_ids))
--> 207 
    208             else:
    209                 #~ assert elem['group_type'] == 'span', \

TooManyChildrenError: Can't parse a multinuc group (28) with more than 2 non-multinuc children: ['25', '30', '31']

In [115]:
# Known failure: the example file triggers the same TooManyChildrenError
# (multinuc group 7 with non-multinuc children '1', '4', '8'), see output.
rstviewer_vs_rsttree('multinuc-plus-two-satellites.rs3', rs3tree_dir=RS3TREE_DIR)


---------------------------------------------------------------------------
TooManyChildrenError                      Traceback (most recent call last)
<ipython-input-115-6cf6d421df86> in <module>()
----> 1 rstviewer_vs_rsttree('multinuc-plus-two-satellites.rs3', rs3tree_dir=RS3TREE_DIR)

<ipython-input-42-9491464ce88a> in rstviewer_vs_rsttree(rs3tree_example_filename, rs3tree_dir, debug, word_wrap)
      3     rs3_filepath = os.path.join(rs3tree_dir, rs3tree_example_filename)
      4     embed_rs3_image(rs3_filepath)
----> 5     rst_tree = RSTTree(rs3_filepath, word_wrap=word_wrap, debug=debug)
      6     IPython.core.display.display(rst_tree)
      7     return rst_tree

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in __init__(self, rs3_file, word_wrap, debug)
     47         self.edu_strings = [self.elem_dict[edu_id]['text']
     48                             for edu_id in self.edus]
---> 49         self.tree = self.dt()
     50 
     51     def _repr_png_(self):

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in dt(self, start_node)
     85         """
     86         if start_node is None:
---> 87             return self.root2tree(start_node=start_node)
     88 
     89         elem_id = start_node

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in root2tree(self, start_node)
    108         num_roots = len(root_nodes)
    109         if num_roots == 1:
--> 110             return self.dt(start_node=root_nodes[0])
    111         elif num_roots > 1:
    112             # An undesired, but common case (at least in the PCC corpus).

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in dt(self, start_node)
    102         else:
    103             return self.group2tree(
--> 104                 elem_id, elem, elem_type, start_node=start_node)
    105 
    106     def root2tree(self, start_node=None):

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in group2tree(self, elem_id, elem, elem_type, start_node)
    205                         "Can't parse a multinuc group (%s) with more than 2 non-multinuc children: %s" \
    206                             % (elem_id, other_child_ids))
--> 207 
    208             else:
    209                 #~ assert elem['group_type'] == 'span', \

TooManyChildrenError: Can't parse a multinuc group (7) with more than 2 non-multinuc children: ['1', '4', '8']

In [116]:
# Hand-built tree over six leaves (German number words 'eins' .. 'sechs'),
# assembled bottom-up. Variable names encode the relation and the leaf range
# they cover, e.g. inter_2_4 = 'interpretation' over leaves 2-4.
# NOTE(review): presumably the tree expected for
# 'multinuc-plus-two-satellites.rs3' -- confirm.
joint_5_6 = ('joint', [
    ('N', ['fuenf']),
    ('N', ['sechs'])
])

conj_2_3 = ('conjunction', [
    ('N', ['zwei']),
    ('N', ['drei'])
])

inter_2_4 = ('interpretation', [
    ('N', [conj_2_3]),
    ('S', ['vier'])
])

inter_2_6 = ('interpretation', [
    ('N', [inter_2_4]),
    ('S', [joint_5_6])])

# the last expression is what the cell displays
t('interpretation', [
    ('S', ['eins']),
    ('N', [inter_2_6])
])


Out[116]:

TODO: fix root_id

root_id is used to extract the relname, but it must never be span

maz-11279: TODO: make a small test case for a non-root nucleus (N) without children


In [44]:
# maz-11279 excerpt with debug=True; logs the
# "non-root nucleus without children" warning for segment 7 (see output).
produced = rstviewer_vs_rsttree(
    'maz-11279-excerpt.rs3', rs3tree_dir=RS3TREE_DIR, word_wrap=10, debug=True)


WARNING:root:Segment '7' in file 'maz-11279-excerpt.rs3' is a non-root nucleus without children

maz-6918: small FIXME: relations without debug label


In [45]:
# maz-6918: original PCC file with debug=True
produced = rstviewer_vs_rsttree(
    'maz-6918.rs3', rs3tree_dir=PCC_RS3_DIR, word_wrap=10, debug=True)



In [46]:
# maz-6918: excerpt file with debug=True, for comparison with the
# original PCC file
produced = rstviewer_vs_rsttree(
    'maz-6918-excerpt.rs3', rs3tree_dir=RS3TREE_DIR, word_wrap=10, debug=True)


maz-00001.rs3


In [9]:
# maz-00001: original PCC file with debug=True
produced = rstviewer_vs_rsttree(
    'maz-00001.rs3', rs3tree_dir=PCC_RS3_DIR, word_wrap=10, debug=True)



In [34]:
# Hand-written tree over leaves '1' .. '22', assembled bottom-up.
# Variable names encode the relation and the leaf range they cover,
# e.g. cause_3_5 = 'cause' over leaves 3-5.
# n() / s() are imported from rs3tree; presumably they wrap their children
# in a nucleus / satellite node -- confirm against rs3tree.
# NOTE(review): presumably the expected tree for the maz-00001 excerpt
# (leaves are segment numbers, as written by create_excert_file) -- confirm.

# --- leaves 2-8 ---
con_4_5 = ('conjunction', [
    n(['4']),
    n(['5'])
])

cause_3_5 = ('cause', [
    s(['3']),
    n([con_4_5])
])

cause_6_7 = ('cause', [
    n(['6']),
    s(['7'])
])

inter_3_7 = ('interpretation', [
    n([cause_3_5]),
    s([cause_6_7])
])

inter_2_7 = ('interpretation', [
    s(['2']),
    n([inter_3_7])
])

eval_2_8 = ('evaluation-n', [
    s([inter_2_7]),
    n(['8'])
])

# --- leaves 9-14 ---
cond_13_14 = ('condition', [
    n(['13']),
    s(['14'])
])

conj_12_14 = ('conjunction', [
    n(['12']),
    n([cond_13_14])
])

evidence_11_14 = ('evidence', [
    n(['11']),
    s([conj_12_14])
])

reason_10_14 = ('reason', [
    n(['10']),
    s([evidence_11_14])
])

list_9_14 = ('list', [
    n(['9']),
    n([reason_10_14])
])

reason_2_14 = ('reason', [
    n([eval_2_8]),
    s([list_9_14])
])

# --- leaves 15-22 ---
reason_17_18 = ('reason', [
    n(['17']),
    s(['18'])
])

conj_16_18 = ('conjunction', [
    n(['16']),
    n([reason_17_18])
])

reason_15_18 = ('reason', [
    n(['15']),
    s([conj_16_18])
])

elab_19_20 = ('elaboration', [
    n(['19']),
    s(['20'])
])

dis_21_22 = ('disjunction', [
    n(['21']),
    n(['22'])
])

inter_19_22 = ('interpretation', [
    s([elab_19_20]),
    n([dis_21_22])
])

result_15_22 = ('result', [
    n([reason_15_18]),
    s([inter_19_22])
])

# --- top of the tree ---
joint_2_22 = ('joint', [
    n([reason_2_14]),
    n([result_15_22])
])

# leaf '1' and the rest of the document are joined under a 'virtual-root'
expected = t('virtual-root', [
    n(['1']),
    n([joint_2_22])
])


Out[34]:

In [12]:
# maz-00001: excerpt file with debug=True, for comparison with the
# original PCC file
produced = rstviewer_vs_rsttree(
    'maz-00001-excerpt.rs3', rs3tree_dir=RS3TREE_DIR, word_wrap=10, debug=True)


maz-14654.rs3


In [11]:
# The string literal below is a traceback pasted from an earlier run; as a
# bare expression statement it has no effect. The call after it still raises
# TooManyChildrenError for multinuc group 28 (see the traceback in the output).
"""
    194                     raise TooManyChildrenError(
    195                         "Can't parse a multinuc group (%s) with more than 2 non-multinuc children: %s" \
--> 196                             % (elem_id, other_child_ids))
    197 
    198             else:

TooManyChildrenError: Can't parse a multinuc group (28) with more than 2 non-multinuc children: ['25', '30', '31']
"""
produced = rstviewer_vs_rsttree(
    'maz-14654-excerpt.rs3', rs3tree_dir=RS3TREE_DIR, word_wrap=10, debug=True)


---------------------------------------------------------------------------
TooManyChildrenError                      Traceback (most recent call last)
<ipython-input-11-33f31e17a7fe> in <module>()
      9 """
     10 produced = rstviewer_vs_rsttree(
---> 11     'maz-14654-excerpt.rs3', rs3tree_dir=RS3TREE_DIR, word_wrap=10, debug=True)

<ipython-input-3-9491464ce88a> in rstviewer_vs_rsttree(rs3tree_example_filename, rs3tree_dir, debug, word_wrap)
      3     rs3_filepath = os.path.join(rs3tree_dir, rs3tree_example_filename)
      4     embed_rs3_image(rs3_filepath)
----> 5     rst_tree = RSTTree(rs3_filepath, word_wrap=word_wrap, debug=debug)
      6     IPython.core.display.display(rst_tree)
      7     return rst_tree

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in __init__(self, rs3_file, word_wrap, debug)
     47         self.edu_strings = [self.elem_dict[edu_id]['text']
     48                             for edu_id in self.edus]
---> 49         self.tree = self.dt()
     50 
     51     def _repr_png_(self):

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in dt(self, start_node)
     85         """
     86         if start_node is None:
---> 87             return self.root2tree(start_node=start_node)
     88 
     89         elem_id = start_node

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in root2tree(self, start_node)
    120 
    121             root_subtrees = [self.dt(start_node=root_id)
--> 122                              for root_id in root_nodes]
    123             # ensure that each subtree is marked as a nucleus
    124             nuc_subtrees = [n_wrap(st, debug=self.debug) for st in root_subtrees]

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in dt(self, start_node)
    102         else:
    103             return self.group2tree(
--> 104                 elem_id, elem, elem_type, start_node=start_node)
    105 
    106     def root2tree(self, start_node=None):

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in group2tree(self, elem_id, elem, elem_type, start_node)
    213                     # this span at the top of a tree was only added for visual purposes
    214                     child_id = self.child_dict[elem_id][0]
--> 215                     return self.dt(start_node=child_id)
    216 
    217                 elif len(self.child_dict[elem_id]) == 2:

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in dt(self, start_node)
    102         else:
    103             return self.group2tree(
--> 104                 elem_id, elem, elem_type, start_node=start_node)
    105 
    106     def root2tree(self, start_node=None):

/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/rs3/rs3tree.pyc in group2tree(self, elem_id, elem, elem_type, start_node)
    205                     raise TooManyChildrenError(
    206                         "Can't parse a multinuc group (%s) with more than 2 non-multinuc children: %s" \
--> 207                             % (elem_id, other_child_ids))
    208 
    209             else:

TooManyChildrenError: Can't parse a multinuc group (28) with more than 2 non-multinuc children: ['25', '30', '31']

In [ ]: