In [61]:
    
import os
from lxml import etree
from nltk.tree import ParentedTree
from rstviewer import embed_rs3_image, embed_rs3str_image
import discoursegraphs as dg
from discoursegraphs.readwrite.rst.rs3 import extract_relationtypes, RSTTree
from discoursegraphs.readwrite.rst.rs3.rs3tree import n, s, debug_root_label
from discoursegraphs import t
import IPython
    
In [41]:
    
# RS3 files of the Potsdam Commentary Corpus, located below discoursegraphs' DATA_ROOT_DIR.
PCC_RS3_DIR = os.path.join(dg.DATA_ROOT_DIR, 'potsdam-commentary-corpus-2.0.0', 'rst')
# rs3tree example files, located below discoursegraphs' DATA_ROOT_DIR.
RS3TREE_DIR = os.path.join(dg.DATA_ROOT_DIR, 'rs3tree')
# Directory that create_excert_file() writes its output to. The default is a
# machine-specific checkout path; set the REPO_RS3TREE_DIR environment variable
# to point it at a different checkout without editing this notebook.
REPO_RS3TREE_DIR = os.environ.get(
    'REPO_RS3TREE_DIR',
    "/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree")
    
In [42]:
    
def rstviewer_vs_rsttree(rs3tree_example_filename, rs3tree_dir=RS3TREE_DIR,
                         debug=False, word_wrap=0):
    """Show an RS3 file via rstviewer and as an RSTTree, then return the tree.

    Parameters
    ----------
    rs3tree_example_filename : str
        Name of the RS3 file, relative to ``rs3tree_dir``.
    rs3tree_dir : str
        Directory that contains the RS3 file.
    debug : bool
        Passed through to ``RSTTree``.
    word_wrap : int
        Passed through to ``RSTTree``.

    Returns
    -------
    RSTTree
        The tree built from the RS3 file (it is also displayed as a side effect).
    """
    # Use the public display API; importing display() from IPython.core.display
    # is deprecated in recent IPython releases.
    from IPython.display import display

    rs3_filepath = os.path.join(rs3tree_dir, rs3tree_example_filename)
    # presumably renders the rstviewer image inline in the notebook -- TODO confirm
    embed_rs3_image(rs3_filepath)
    rst_tree = RSTTree(rs3_filepath, word_wrap=word_wrap, debug=debug)
    display(rst_tree)
    return rst_tree
def print_example(rs3tree_example_filename, rs3tree_dir=RS3TREE_DIR):
    """Print the raw content of an RS3 file located in ``rs3tree_dir``."""
    example_path = os.path.join(rs3tree_dir, rs3tree_example_filename)
    with open(example_path, 'r') as example_file:
        raw_content = example_file.read()
    print(raw_content)
    
In [43]:
    
def create_excert_file(pccrs3_input_filename):
    """Write an "excerpt" copy of a PCC RS3 file with numbered segments.

    The file is read from ``PCC_RS3_DIR``. The text of every ``segment``
    element is replaced by its 1-based position in document order, and the
    result is written to ``REPO_RS3TREE_DIR`` as ``<name>-excerpt<ext>``.
    The output path is printed.

    Parameters
    ----------
    pccrs3_input_filename : str
        File name (not a path) of an RS3 file in ``PCC_RS3_DIR``,
        e.g. ``'maz-11279.rs3'``.
    """
    rs3_filepath = os.path.join(PCC_RS3_DIR, pccrs3_input_filename)
    tree = etree.parse(rs3_filepath)
    for i, segment in enumerate(tree.iter('segment'), 1):
        segment.text = str(i)

    # splitext() splits at the last dot only, so names with several dots
    # (or none) no longer raise ValueError; ``ext`` keeps its leading dot.
    doc_id, ext = os.path.splitext(pccrs3_input_filename)
    output_filename = "{}-excerpt{}".format(doc_id, ext)
    output_filepath = os.path.join(REPO_RS3TREE_DIR, output_filename)

    output_str = etree.tostring(tree, pretty_print=True,
                                xml_declaration=True, encoding='UTF-8')
    # tostring() with an explicit encoding returns bytes, so the file must be
    # opened in binary mode (text mode raises TypeError on Python 3).
    with open(output_filepath, 'wb') as out_file:
        out_file.write(output_str)
    print(output_filepath)
    
In [98]:
    
# Earlier selection of files, kept for reference:
# broken_test_files = ['maz-11279.rs3', 'maz-6918.rs3', 'maz-00001.rs3', 'maz-14654.rs3']

# Generate a numbered-segment excerpt file for each of these PCC documents.
broken_test_files = [
    'maz-12666.rs3',
    'maz-14813.rs3',
    'maz-11279.rs3',
    'maz-14654.rs3',
    'maz-4472.rs3',
]
for broken_test_file in broken_test_files:
    create_excert_file(broken_test_file)
    
    
In [ ]:
    
def no_span_nodes(tree, debug=False, root_id=None):
    """Return True, iff there is no span node in the given ParentedTree.

    Parameters
    ----------
    tree : ParentedTree
        The (sub)tree to check.
    debug : bool
        Passed to ``debug_root_label`` when building the span label.
    root_id : str or None
        ID used for building the span label; defaults to ``tree.root_id``
        and is handed down unchanged to all recursive calls.

    Returns
    -------
    bool
        False if ``tree`` itself or any ParentedTree below it carries the
        span label, True otherwise.
    """
    assert isinstance(tree, ParentedTree)
    if root_id is None:
        root_id = tree.root_id
    span_label = debug_root_label('span', debug=debug, root_id=root_id)
    if tree.label() == span_label:
        return False
    # Every subtree has to be checked. Returning the result of the first
    # recursive call would ignore all later siblings. Non-tree children
    # (leaves) are skipped.
    return all(no_span_nodes(child, debug=debug, root_id=root_id)
               for child in tree if isinstance(child, ParentedTree))
    
In [96]:
    
# Hand-built tree: a 'joint' whose second nucleus contains a 'background' relation.
t(
    'joint',
    [
        ('N', ['foo']),
        ('N', [('background', [('S', ['bar']), ('N', ['baz'])])]),
    ]
)
    
    Out[96]:
In [ ]:
    
# NOTE(review): unexecuted scratch cell that calls t() with an empty label;
# its purpose is not evident from this notebook -- consider removing it.
t('')
    
In [99]:
    
# maz-11279: first the excerpt from RS3TREE_DIR (debug off), then the complete
# PCC file (debug on). `produced` ends up holding the tree of the second call.
produced = rstviewer_vs_rsttree(
    'maz-11279-excerpt.rs3',
    rs3tree_dir=RS3TREE_DIR,
    word_wrap=10,
    debug=False
)
produced = rstviewer_vs_rsttree(
    'maz-11279.rs3',
    rs3tree_dir=PCC_RS3_DIR,
    word_wrap=10,
    debug=True
)
    
    
    
    
    
    
    
In [102]:
    
# Complete PCC file maz-14654, with the default word_wrap and debug settings.
produced = rstviewer_vs_rsttree(
    'maz-14654.rs3',
    rs3tree_dir=PCC_RS3_DIR
)
    
    
    
In [115]:
    
# rs3tree example file 'multinuc-plus-two-satellites.rs3'; the returned tree is
# the cell's last expression and is therefore shown as the cell output.
rstviewer_vs_rsttree(
    'multinuc-plus-two-satellites.rs3',
    rs3tree_dir=RS3TREE_DIR
)
    
    
    
In [116]:
    
# Hand-built nested tree, assembled bottom-up from (label, children) tuples.
# The numeric suffixes presumably name the segment range a subtree covers.
joint_5_6 = ('joint', [('N', ['fuenf']), ('N', ['sechs'])])
conj_2_3 = ('conjunction', [('N', ['zwei']), ('N', ['drei'])])
inter_2_4 = ('interpretation', [('N', [conj_2_3]), ('S', ['vier'])])
inter_2_6 = ('interpretation', [('N', [inter_2_4]), ('S', [joint_5_6])])
t('interpretation', [('S', ['eins']), ('N', [inter_2_6])])
    
    Out[116]:
In [44]:
    
# Excerpt of maz-11279 from RS3TREE_DIR, with debug enabled.
produced = rstviewer_vs_rsttree(
    'maz-11279-excerpt.rs3',
    rs3tree_dir=RS3TREE_DIR,
    word_wrap=10,
    debug=True
)
    
    
    
    
In [45]:
    
# Complete PCC file maz-6918, with debug enabled.
produced = rstviewer_vs_rsttree(
    'maz-6918.rs3',
    rs3tree_dir=PCC_RS3_DIR,
    word_wrap=10,
    debug=True
)
    
    
    
In [46]:
    
# Excerpt of maz-6918 from RS3TREE_DIR, with debug enabled.
produced = rstviewer_vs_rsttree(
    'maz-6918-excerpt.rs3',
    rs3tree_dir=RS3TREE_DIR,
    word_wrap=10,
    debug=True
)
    
    
    
In [9]:
    
# Complete PCC file maz-00001, with debug enabled.
produced = rstviewer_vs_rsttree(
    'maz-00001.rs3',
    rs3tree_dir=PCC_RS3_DIR,
    word_wrap=10,
    debug=True
)
    
    
    
In [34]:
    
# Hand-built `expected` tree, assembled bottom-up with the n()/s() helpers.
# The numeric suffixes presumably name the segment range a subtree covers.

# segments 2-8
con_4_5 = ('conjunction', [n(['4']), n(['5'])])
cause_3_5 = ('cause', [s(['3']), n([con_4_5])])
cause_6_7 = ('cause', [n(['6']), s(['7'])])
inter_3_7 = ('interpretation', [n([cause_3_5]), s([cause_6_7])])
inter_2_7 = ('interpretation', [s(['2']), n([inter_3_7])])
eval_2_8 = ('evaluation-n', [s([inter_2_7]), n(['8'])])

# segments 9-14
cond_13_14 = ('condition', [n(['13']), s(['14'])])
conj_12_14 = ('conjunction', [n(['12']), n([cond_13_14])])
evidence_11_14 = ('evidence', [n(['11']), s([conj_12_14])])
reason_10_14 = ('reason', [n(['10']), s([evidence_11_14])])
list_9_14 = ('list', [n(['9']), n([reason_10_14])])

reason_2_14 = ('reason', [n([eval_2_8]), s([list_9_14])])

# segments 15-18
reason_17_18 = ('reason', [n(['17']), s(['18'])])
conj_16_18 = ('conjunction', [n(['16']), n([reason_17_18])])
reason_15_18 = ('reason', [n(['15']), s([conj_16_18])])

# segments 19-22
elab_19_20 = ('elaboration', [n(['19']), s(['20'])])
dis_21_22 = ('disjunction', [n(['21']), n(['22'])])
inter_19_22 = ('interpretation', [s([elab_19_20]), n([dis_21_22])])

result_15_22 = ('result', [n([reason_15_18]), s([inter_19_22])])
joint_2_22 = ('joint', [n([reason_2_14]), n([result_15_22])])

# complete tree
expected = t('virtual-root', [n(['1']), n([joint_2_22])])
    
    Out[34]:
In [12]:
    
# Excerpt of maz-00001 from RS3TREE_DIR, with debug enabled.
produced = rstviewer_vs_rsttree(
    'maz-00001-excerpt.rs3',
    rs3tree_dir=RS3TREE_DIR,
    word_wrap=10,
    debug=True
)
    
    
    
In [11]:
    
"""
    194                     raise TooManyChildrenError(
    195                         "Can't parse a multinuc group (%s) with more than 2 non-multinuc children: %s" \
--> 196                             % (elem_id, other_child_ids))
    197 
    198             else:
TooManyChildrenError: Can't parse a multinuc group (28) with more than 2 non-multinuc children: ['25', '30', '31']
"""
# Excerpt of maz-14654 from RS3TREE_DIR, with debug enabled.
# NOTE(review): the string literal at the top of this cell is presumably the
# TooManyChildrenError traceback this call raised earlier -- confirm whether
# it still occurs.
produced = rstviewer_vs_rsttree(
    'maz-14654-excerpt.rs3',
    rs3tree_dir=RS3TREE_DIR,
    word_wrap=10,
    debug=True
)
    
    
    
In [ ]: