In [1]:
import codecs
import os
import re
import discoursegraphs as dg
from discoursegraphs.readwrite.tree import t
from discoursegraphs.readwrite.rst.dis.common import DisFile
In [2]:
maz_14399_rs3 = '/home/arne/.virtualenvs/discoursegraphs/local/lib/python2.7/site-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-14399.rs3'
minimal_rs3 = '/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree/foo-bar-circ-bar-to-foo.rs3'
min2_rs3 = '/home/arne/repos/discoursegraphs/src/discoursegraphs/data/rs3tree/eins-zwei-drei-(joint-eins-and-zwei-and-drei).rs3'
In [3]:
dg.read_rs3tree(min2_rs3)
Out[3]:
In [4]:
MIN_RS3_STR = """
<rst>
<header>
<relations>
<rel name="elaboration" type="rst" />
<rel name="joint" type="multinuc" />
</relations>
</header>
<body>
<segment id="1" parent="4" relname="joint">eins zwei drei</segment>
<segment id="2" parent="4" relname="joint">vier fünf sechs .</segment>
<segment id="3" parent="4" relname="joint">I bims . Wüss' iss ? I woaß es a net!</segment>
<group id="4" type="multinuc" />
</body>
</rst>
"""
In [5]:
min_rs3_tree = dg.read_rs3tree.fromstring(MIN_RS3_STR)
min_rs3_tree # OKAY
Out[5]:
In [6]:
dg.write_dis(min_rs3_tree, '/tmp/min_rs3_tree.dis')
Out[6]:
In [7]:
dg.read_distree('/tmp/min_rs3_tree.dis')
Out[7]:
In [23]:
min_rs3_tree.tree.pformat() == dg.read_distree('/tmp/min_rs3_tree.dis').tree.pformat()
Out[23]:
In [8]:
okay = 0
fail = 0
for rs3_file in dg.corpora.pcc.get_files_by_layer('rst'):
rs3_fname = os.path.basename(rs3_file)
rst_tree = dg.read_rs3tree(rs3_file)
tree_fname = rs3_fname + '.tree'
with codecs.open(tree_fname, 'w', 'utf-8') as outfile:
outfile.write(rst_tree.tree.pformat())
dis_fname = rs3_fname + '.dis'
dis_tree = dg.write_dis(rst_tree, dis_fname)
reconverted_rst_tree = dg.read_distree(dis_fname)
reconverted_tree_fname = rs3_fname + '.tree-reconverted'
with codecs.open(reconverted_tree_fname, 'w', 'utf-8') as outfile:
outfile.write(reconverted_rst_tree.tree.pformat())
if rst_tree.tree.pformat() != reconverted_rst_tree.tree.pformat():
fail += 1
print "FAIL: ", rs3_fname, os.path.getsize(reconverted_tree_fname)
else:
okay += 1
# print "OKAY: ", rs3_fname
print "OKAY counts: ", okay
print "FAIL counts: ", fail
In [9]:
import re
DIS_TEST_STR = """( Root (span 1 2)
( Satellite (leaf 1) (rel2par Contrast) (text _!Sie hatte vor ( ! ) der langen Nacht die Zahlen gedruckt ._!) )
( Nucleus (leaf 2) (rel2par span) (text _!an den Initiator , den Internationalen Bund ( IB ) , übergeben ._!) )
)"""
In [10]:
dis_test_tree = dg.read_distree.fromstring(DIS_TEST_STR)
dis_test_tree # OKAY, correctly converted to RST tree
Out[10]:
In [11]:
dis_test_tree.disfile_tree # OKAY, correctly parsed
Out[11]:
In [12]:
dg.write_dis(dis_test_tree) # OKAY, corectly converted back
Out[12]:
In [13]:
dis_test_tree.disfile_tree.pformat() == dg.write_dis(dis_test_tree).disfiletree.pformat()
Out[13]:
In [14]:
print dg.write_dis(dis_test_tree).disfiletree.pformat()
In [15]:
maz_8727_tree = dg.read_distree('maz-8727.rs3.dis', word_wrap=10)
maz_8727_tree # FAIL: '( ! )' is missing
Out[15]:
In [16]:
maz_8727_tree.disfile_tree # FAIL, contains '!' instead of '( ! )'
Out[16]:
In [17]:
maz_8727_tree.disfile_tree[2][3][3][3][3][2]
Out[17]:
In [18]:
dis_subtree = maz_8727_tree.disfile_tree[2][3][3][3][3][2]
dis_subtree_str = dis_subtree.pformat()
print dis_subtree_str
In [19]:
u' '.join(line.strip() for line in dis_subtree_str.splitlines())
Out[19]:
In [20]:
def join_lines(matchobj):
edu_multiline_str = matchobj.group(0)
ed_oneline_str = u' '.join(line.strip()
for line in edu_multiline_str.splitlines())
return re.sub('\n', '', ed_oneline_str)
print re.sub('_!(.*?)_!', join_lines, dis_subtree_str, flags=re.DOTALL)
In [ ]: