In [1]:
    
from collections import defaultdict, OrderedDict
from lxml import etree
from discoursegraphs import t
from discoursegraphs.readwrite.rst.rs3 import RSTTree, dt, extract_relationtypes, get_rs3_data
from discoursegraphs.readwrite.tree import get_position
from rstviewer import embed_rs3_image, embed_rs3str_image
    
In [2]:
    
RS3_EXAMPLE_FILE = '/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-1818.rs3'
    
In [3]:
    
embed_rs3_image(RS3_EXAMPLE_FILE)
    
    
In [4]:
    
RSTTree(RS3_EXAMPLE_FILE)
    
    Out[4]:
In [5]:
    
RSTTree(RS3_EXAMPLE_FILE, word_wrap=10)
    
    Out[5]:
In [6]:
    
RSTTree(RS3_EXAMPLE_FILE, word_wrap=15)[1][0]
    
    Out[6]:
In [7]:
    
cdict, edict, ordered_edus = get_rs3_data(RS3_EXAMPLE_FILE)
edu_set = set(ordered_edus)
    
In [8]:
    
dt(cdict, edict, ordered_edus, start_node=None)
    
    Out[8]:
In [9]:
    
dt(cdict, edict, ordered_edus, start_node='1') # OK
    
    Out[9]:
In [10]:
    
dt(cdict, edict, ordered_edus, start_node='11') # OK
    
    Out[10]:
In [11]:
    
dt(cdict, edict, ordered_edus, start_node='2') # OK
    
    Out[11]:
In [12]:
    
dt(cdict, edict, ordered_edus, start_node='3') # OK
    
    Out[12]:
In [13]:
    
dt(cdict, edict, ordered_edus, start_node='4') # OK
    
    Out[13]:
In [14]:
    
dt(cdict, edict, ordered_edus, start_node='5') # OK
    
    Out[14]:
In [15]:
    
dt(cdict, edict, ordered_edus, start_node='6') # OK
    
    Out[15]:
In [16]:
    
dt(cdict, edict, ordered_edus, start_node='7') # OK
    
    Out[16]:
In [17]:
    
dt(cdict, edict, ordered_edus, start_node='8') # OK
    
    Out[17]:
In [18]:
    
dt(cdict, edict, ordered_edus, start_node='9') # OK
    
    Out[18]:
In [19]:
    
dt(cdict, edict, ordered_edus, start_node='10') # OK
    
    Out[19]:
In [20]:
    
dt(cdict, edict, ordered_edus, start_node='12') # OK
    
    Out[20]:
In [21]:
    
dt(cdict, edict, ordered_edus, start_node='13') # OK
    
    Out[21]:
In [22]:
    
dt(cdict, edict, ordered_edus, start_node='14') # OK
    
    Out[22]:
In [23]:
    
dt(cdict, edict, ordered_edus, start_node='15') # OK
    
    Out[23]:
In [24]:
    
dt(cdict, edict, ordered_edus, start_node='16') # OK
    
    Out[24]:
In [25]:
    
dt(cdict, edict, ordered_edus, start_node='17') # OK
    
    Out[25]:
In [26]:
    
dt(cdict, edict, ordered_edus, start_node='18') # OK
    
    Out[26]:
In [27]:
    
dt(cdict, edict, ordered_edus, start_node='19') # OK
    
    Out[27]:
In [28]:
    
dt(cdict, edict, ordered_edus, start_node='20') # OK
    
    Out[28]:
In [29]:
    
RSTTree(RS3_EXAMPLE_FILE)
    
    Out[29]:
In [30]:
    
maz_4031 = '/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-4031.rs3'
    
In [31]:
    
# TODO: show Manfred. Should be illegal: concession(6-9 -> 5) and concession(10-11 -> 5)!
embed_rs3_image(maz_4031)
    
    
In [32]:
    
# maz_6728 = '/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-6728.rs3'
    
In [33]:
    
# TODO: show Manfred. Should be illegal: reason(6-8 -> 2-5) AND evidence(9-11 -> 2-5)
# embed_rs3_image(maz_6728)
    
In [34]:
    
maz_18377 = '/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-18377.rs3'
    
In [35]:
    
# embed_rs3_image(maz_18377)
    
In [36]:
    
RSTTree(maz_18377)
    
    Out[36]:
In [37]:
    
maz_4031 = '/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-4031.rs3'
    
In [38]:
    
# embed_rs3_image(maz_4031)
    
In [39]:
    
# AssertionError: A multinuc group (33) should not have > 1 non-multinuc children:
# ['26', '30', '32', '22', '36']
maz_2669 = '/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-2669.rs3'
    
In [40]:
    
# embed_rs3_image(maz_2669)
    
In [41]:
    
# AssertionError: A multinuc group (19) should not have > 1 non-multinuc children: []
maz_14813 = '/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-14813.rs3'
    
In [42]:
    
cdict, edict, ordered_edus = get_rs3_data(maz_14813, word_wrap=20)
edu_set = set(ordered_edus)
    
In [43]:
    
# embed_rs3_image(maz_14813)
    
In [44]:
    
# AssertionError: A multinuc group (19) should not have > 1 non-multinuc children: []
maz_7220 = '/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-7220.rs3'
    
In [45]:
    
cdict, edict, ordered_edus = get_rs3_data(maz_7220, word_wrap=10)
edu_set = set(ordered_edus)
    
In [46]:
    
# embed_rs3_image(maz_7220)
    
In [47]:
    
dt(cdict, edict, ordered_edus, start_node='19')
    
    Out[47]:
In [48]:
    
# FAIL: name of relation missing: conjunction(8, 11)
dt(cdict, edict, ordered_edus, start_node='12')
    
    Out[48]:
In [49]:
    
cdict['12']
    
    Out[49]:
In [50]:
    
edict['12']
    
    Out[50]:
In [51]:
    
edict['8']
    
    Out[51]:
In [52]:
    
import os
import traceback
from collections import Counter
from discoursegraphs.corpora import pcc
counter = Counter()
exceptions = Counter()
for rs3_file in pcc.rst:
    try:
        rt = RSTTree(rs3_file)
        counter["wins"] += 1
    except Exception as e:
        print rs3_file
        print("FAIL")
#         print traceback.format_exc()
        print e
        counter["fails"] += 1
#         exceptions[e.__class__.__name__] += 1
        exceptions[e] += 1
        print
    
    
In [53]:
    
print counter, "\n"
for e, count in exceptions.most_common():
    print e, count
    
    
AssertionError("A span segment (5) should have one child: ['17', '20']",)
AssertionError("A multinuc group (27) should not have > 1 non-multinuc children: ['5', '6', '7', '10', '28', '29']",)
ValueError("A span group ('21') should not have > 2 children: ['4', '19', '20']",)
In [ ]: