In [1]:
    
import codecs
import os
import re
import discoursegraphs as dg
from discoursegraphs.readwrite.tree import t
from discoursegraphs.readwrite.rst.dis.common import DisFile
    
In [2]:
    
# Example input files used throughout this notebook.
# NOTE: these are absolute paths on the original author's machine; adjust the
# two directory prefixes below to run the notebook elsewhere.
PCC_RST_DIR = ('/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/'
               'site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/'
               'data/potsdam-commentary-corpus-2.0.0/rst')
RS3TREE_DIR = '/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree'

maz_14399_rs3 = PCC_RST_DIR + '/maz-14399.rs3'
minimal_rs3 = RS3TREE_DIR + '/foo-bar-circ-bar-to-foo.rs3'
min2_rs3 = RS3TREE_DIR + '/eins-zwei-drei-(joint-eins-and-zwei-and-drei).rs3'
    
In [3]:
    
# Parse the rs3 file referenced by min2_rs3; as the last expression of the
# cell, the returned object is displayed.
dg.read_rs3tree(min2_rs3)
    
    Out[3]:
In [4]:
    
# Minimal in-memory RS3 document: three segments (ids 1-3) that are all
# attached to the multinuc group 4 via the "joint" relation. The segment
# texts contain irregular runs of spaces and non-ASCII characters.
# NOTE(review): the odd spacing is presumably deliberate, to exercise
# whitespace handling in the rs3 -> dis round trip -- confirm.
MIN_RS3_STR = """
<rst>
  <header>
    <relations>
      <rel name="elaboration" type="rst" />
      <rel name="joint" type="multinuc" />
    </relations>
  </header>
  <body>
    <segment id="1" parent="4" relname="joint">eins zwei  drei</segment>
    <segment id="2" parent="4" relname="joint">vier   fünf  sechs   .</segment>
    <segment id="3" parent="4" relname="joint">I  bims .  Wüss'   iss  ? I  woaß es  a net!</segment>
    <group id="4" type="multinuc" />
  </body>
</rst>
"""
    
In [5]:
    
# Build the tree from the in-memory RS3 string instead of from a file,
# then display it.
min_rs3_tree = dg.read_rs3tree.fromstring(MIN_RS3_STR)
min_rs3_tree # OKAY
    
    Out[5]:
In [6]:
    
# Convert the tree built from MIN_RS3_STR and write it to a .dis file under
# /tmp; the following cells read that file back in.
dg.write_dis(min_rs3_tree, '/tmp/min_rs3_tree.dis')
    
    Out[6]:
In [7]:
    
# Read the .dis file produced from min_rs3_tree back in and display the result.
dg.read_distree('/tmp/min_rs3_tree.dis')
    
    Out[7]:
In [23]:
    
# Round-trip check for the minimal example: the pretty-printed tree parsed
# from RS3 must equal the one obtained by re-reading the written .dis file.
original_min_tree_str = min_rs3_tree.tree.pformat()
reread_min_tree_str = dg.read_distree('/tmp/min_rs3_tree.dis').tree.pformat()
original_min_tree_str == reread_min_tree_str
    
    Out[23]:
In [8]:
    
okay = 0
fail = 0
for rs3_file in dg.corpora.pcc.get_files_by_layer('rst'):
    rs3_fname = os.path.basename(rs3_file)
    rst_tree = dg.read_rs3tree(rs3_file)
    
    tree_fname = rs3_fname + '.tree'
    with codecs.open(tree_fname, 'w', 'utf-8') as outfile:
        outfile.write(rst_tree.tree.pformat())
    
    dis_fname = rs3_fname + '.dis'
    dis_tree = dg.write_dis(rst_tree, dis_fname)
    
    reconverted_rst_tree = dg.read_distree(dis_fname)
    reconverted_tree_fname = rs3_fname + '.tree-reconverted'
    with codecs.open(reconverted_tree_fname, 'w', 'utf-8') as outfile:
        outfile.write(reconverted_rst_tree.tree.pformat())
    
    if rst_tree.tree.pformat() != reconverted_rst_tree.tree.pformat():
        fail += 1
        print "FAIL: ", rs3_fname, os.path.getsize(reconverted_tree_fname)
    else:
        okay += 1
#         print "OKAY: ", rs3_fname
print "OKAY counts: ", okay
print "FAIL counts: ", fail
    
    
In [9]:
    
import re
# Hand-written .dis test input: a Root spanning leaves 1-2 with one Satellite
# and one Nucleus. Both EDU texts contain parenthesised tokens ('( ! )' and
# '( IB )') inside the _!..._! text delimiters.
# NOTE(review): presumably a reduced reproduction of the '( ! )' problem seen
# with maz-8727 further down -- confirm.
DIS_TEST_STR = """( Root (span 1 2)
  ( Satellite (leaf 1) (rel2par Contrast) (text _!Sie hatte vor ( ! ) der langen Nacht die Zahlen gedruckt ._!) )
  ( Nucleus (leaf 2) (rel2par span) (text _!an den Initiator , den Internationalen Bund ( IB ) , übergeben ._!) )
)"""
    
In [10]:
    
# Parse the .dis test string directly from memory and display the result.
dis_test_tree = dg.read_distree.fromstring(DIS_TEST_STR)
dis_test_tree # OKAY, correctly converted to RST tree
    
    Out[10]:
In [11]:
    
# Display the disfile_tree attribute of the parsed result.
dis_test_tree.disfile_tree  # OKAY, correctly parsed
    
    Out[11]:
In [12]:
    
# Call write_dis without an output path; the cell displays the returned object.
dg.write_dis(dis_test_tree) # OKAY, correctly converted back
    
    Out[12]:
In [13]:
    
# Round-trip check for the .dis test string: the parsed dis-file tree must
# pretty-print identically to the tree that write_dis generates from it.
parsed_dis_str = dis_test_tree.disfile_tree.pformat()
rewritten_dis_str = dg.write_dis(dis_test_tree).disfiletree.pformat()
parsed_dis_str == rewritten_dis_str
    
    Out[13]:
In [14]:
    
# Print the pformat() text of the disfiletree attribute of the object
# returned by write_dis.
print dg.write_dis(dis_test_tree).disfiletree.pformat()
    
    
In [15]:
    
# Read a .dis file from the current working directory with word_wrap=10 and
# display the result.
# NOTE(review): the file name matches the '<rs3 basename>.dis' pattern written
# by the corpus round-trip loop earlier in this notebook, so that cell
# presumably has to run first -- confirm.
maz_8727_tree = dg.read_distree('maz-8727.rs3.dis', word_wrap=10)
maz_8727_tree # FAIL: '( ! )' is missing
    
    Out[15]:
In [16]:
    
# Display the disfile_tree attribute of the parsed result.
maz_8727_tree.disfile_tree # FAIL, contains '!' instead of '( ! )'
    
    Out[16]:
In [17]:
    
# Drill down to one nested subtree via hard-coded child indices.
# NOTE(review): presumably the node whose text lost the '( ! )' -- confirm.
maz_8727_tree.disfile_tree[2][3][3][3][3][2]
    
    Out[17]:
In [18]:
    
# Keep the nested subtree and its pformat() text in variables; the text is
# the input of the line-joining experiments in the following cells.
dis_subtree = maz_8727_tree.disfile_tree[2][3][3][3][3][2]
dis_subtree_str = dis_subtree.pformat()
print dis_subtree_str
    
    
In [19]:
    
# Collapse the whole subtree string onto one line: strip each line, then
# join the pieces with single spaces.
stripped_subtree_lines = (line.strip() for line in dis_subtree_str.splitlines())
u' '.join(stripped_subtree_lines)
    
    Out[19]:
In [20]:
    
def join_lines(matchobj):
    """Collapse the text of a regex match onto a single line.

    Meant to be used as the replacement callback of ``re.sub``: the complete
    matched text is split into lines, each line is stripped of leading and
    trailing whitespace, and the pieces are joined with single spaces.

    Parameters
    ----------
    matchobj : re match object
        the match whose complete text (``group(0)``) is to be joined

    Returns
    -------
    oneline_str : unicode
        the matched text without any line breaks
    """
    edu_multiline_str = matchobj.group(0)
    # splitlines() already consumes every line break, so no newline can
    # survive into the joined string and no further clean-up is required.
    return u' '.join(line.strip()
                     for line in edu_multiline_str.splitlines())
# Apply join_lines to every (non-greedy) _!..._! span of the subtree string.
# re.DOTALL lets '.' match newlines, so a span that is broken across several
# lines is matched -- and therefore joined -- as a whole; text outside the
# delimiters keeps its line breaks.
print re.sub('_!(.*?)_!', join_lines, dis_subtree_str, flags=re.DOTALL)
    
    
In [ ]: