In [1]:
import os
from tempfile import NamedTemporaryFile
from lxml import etree
from lxml.builder import E
import discoursegraphs as dg
from discoursegraphs.readwrite.rst.rs3 import extract_relationtypes, RSTTree
from discoursegraphs.readwrite.rst.rs3.rs3tree import (
is_leaf, NUCLEARITY_LABELS)
from discoursegraphs.readwrite.rst.rs3.rs3filewriter import RS3FileWriter
from discoursegraphs.readwrite.tree import DGParentedTree, t
In [2]:
# Directory (below dg.DATA_ROOT_DIR) with the rs3 example files used in this notebook.
RS3TREE_DIR = os.path.join(dg.DATA_ROOT_DIR, 'rs3tree')
# 'rst' directory of the Potsdam Commentary Corpus 2.0.0 data (below dg.DATA_ROOT_DIR).
PCC_RS3_DIR = os.path.join(dg.DATA_ROOT_DIR,
                           'potsdam-commentary-corpus-2.0.0', 'rst')
def example2tree(rs3tree_example_filename, rs3tree_dir=RS3TREE_DIR,
                 word_wrap=0, debug=False):
    """Parse an rs3 file into an RSTTree.

    The file is looked up as ``rs3tree_dir/rs3tree_example_filename``;
    ``word_wrap`` and ``debug`` are handed to RSTTree unchanged.
    """
    rs3_path = os.path.join(rs3tree_dir, rs3tree_example_filename)
    rst_tree = RSTTree(rs3_path, word_wrap=word_wrap, debug=debug)
    return rst_tree
def rs3view(rs3tree_example_filename, rs3tree_dir=RS3TREE_DIR):
    """Render the given rs3 example file with rstviewer.

    The file is looked up as ``rs3tree_dir/rs3tree_example_filename``.
    Returns whatever ``rstviewer.embed_rs3_image`` returns for that path.
    """
    # Local import: the module name `rstviewer` is not bound anywhere at
    # notebook level, so the former `rstviewer.embed_rs3_image(...)` call
    # raised a NameError.
    from rstviewer import embed_rs3_image

    filepath = os.path.join(rs3tree_dir, rs3tree_example_filename)
    return embed_rs3_image(filepath)
def cat(filename, filedir_path=RS3TREE_DIR):
    """Print the whole content of ``filedir_path/filename``."""
    with open(os.path.join(filedir_path, filename)) as handle:
        content = handle.read()
    print(content)
In [3]:
# List the files available in the rs3 example directory.
os.listdir(RS3TREE_DIR)
Out[3]:
In [4]:
def etree2file(rst_etree):
    """Serialize an etree element into a temporary file and return its path.

    The element is pretty-printed as UTF-8 encoded XML. The file is created
    with ``delete=False``, i.e. it stays on disk after this function returns;
    the caller is responsible for removing it.
    """
    out_str = etree.tostring(rst_etree, pretty_print=True, encoding="UTF-8")
    # tostring() with an explicit encoding yields an encoded byte string, so
    # write it through the (binary-mode) temp file handle itself. The `with`
    # block also closes the handle, which was previously left open.
    with NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(out_str)
    return temp_file.name
In [5]:
# Parse the 'empty.rs3' example and display the resulting RSTTree.
dgtree = example2tree('empty.rs3')
dgtree
Out[5]:
In [6]:
# Show the raw file content of 'empty.rs3'.
cat('empty.rs3')
In [7]:
# Build an RS3FileWriter from the parsed tree; the cell displays the writer object.
RS3FileWriter(dgtree)
Out[7]:
In [8]:
# Inspect the writer's .etree attribute for the 'empty.rs3' example.
RS3FileWriter(example2tree('empty.rs3'), debug=False).etree
Out[8]:
In [9]:
# Round trip: parse 'empty.rs3', write its etree to a temporary file, parse that file again.
example2tree(etree2file(RS3FileWriter(example2tree('empty.rs3'), debug=False).etree))
Out[9]:
In [10]:
# Parse the 'only-one-segment.rs3' example and display the resulting RSTTree.
dgtree = example2tree('only-one-segment.rs3')
dgtree
Out[10]:
In [11]:
# Build an RS3FileWriter from the 'only-one-segment.rs3' tree; the cell displays the writer object.
RS3FileWriter(dgtree)
Out[11]:
In [12]:
# Round trip: write the tree's etree to a temporary file and parse that file again.
example2tree(etree2file(RS3FileWriter(dgtree, debug=False).etree))
Out[12]:
In [13]:
# Show the raw file content of 'only-one-segment.rs3'.
cat('only-one-segment.rs3')
In [14]:
# Parse and display the 'foo-bar-circ-foo-to-bar.rs3' example.
example2tree("foo-bar-circ-foo-to-bar.rs3")
Out[14]:
In [15]:
# Bind the .tree attribute of the parsed example to `tree`.
tree = example2tree("foo-bar-circ-foo-to-bar.rs3").tree
def cousins_treepos(treepos):
    """Return the tree positions of all cousins of the node at ``treepos``.

    Cousins are the children of every child of the node's grandparent,
    except for the children of the node's own parent. An empty list is
    returned if the grandparent position is None.

    NOTE(review): relies on ``parent_treepos`` and ``children_treepos``
    being available in the notebook namespace -- they are not defined in
    the cells shown here; confirm where they come from.
    """
    mother = parent_treepos(treepos)
    grandma = parent_treepos(mother)
    if grandma is None:
        return []
    return [cousin
            for aunt in children_treepos(grandma) if aunt != mother
            for cousin in children_treepos(aunt)]
In [16]:
# Round trip: parse the example, write its etree to a temporary file, parse that file again.
example2tree(etree2file(RS3FileWriter(example2tree("foo-bar-circ-foo-to-bar.rs3")).etree))
Out[16]:
In [17]:
# Show the raw file content of 'foo-bar-circ-foo-to-bar.rs3'.
cat("foo-bar-circ-foo-to-bar.rs3")
In [18]:
# Parse and display the 'foo-bar-foo-conj-bar.rs3' example.
example2tree('foo-bar-foo-conj-bar.rs3')
Out[18]:
In [19]:
# Show the raw file content of 'foo-bar-foo-conj-bar.rs3'.
cat('foo-bar-foo-conj-bar.rs3')
In [20]:
# Round trip: parse the example, write its etree to a temporary file, parse that file again.
example2tree(etree2file(RS3FileWriter(example2tree('foo-bar-foo-conj-bar.rs3')).etree))
Out[20]:
In [21]:
# Parse the three-segment 'eins-zwei-drei' example and display the resulting RSTTree.
dgtree = example2tree('eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3')
dgtree
Out[21]:
In [22]:
# Show the raw file content of the 'eins-zwei-drei' example.
cat('eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3')
In [23]:
# Round trip: write the tree's etree to a temporary file and parse that file again.
example2tree(etree2file(RS3FileWriter(dgtree).etree))
Out[23]:
In [24]:
# for fname in os.listdir(RS3TREE_DIR):
# if fname.startswith('maz'): print fname
In [25]:
# Parse and display one MAZ excerpt; the commented-out lines are alternative
# excerpts that can be swapped in.
# NOTE(review): the trailing numbers look like EDU counts -- confirm.
# example2tree('maz-6918-excerpt.rs3') # 12
example2tree('maz-10575-excerpt.rs3') # 5
# example2tree('maz-4472-excerpt.rs3') # 13
# example2tree('maz-12666-excerpt.rs3') # 22
# example2tree('maz-00001-excerpt.rs3') # 22
# example2tree('maz-3367-excerpt.rs3') # 23
Out[25]:
In [26]:
# Hand-built tree with the five leaves 'eins' ... 'fuenf'; each nesting
# level alternates between a relation label and its 'N'/'S' children.
input_tree = t('interpretation', [
    ('N', [
        ('circumstance', [
            ('S', ['eins']),
            ('N', [
                ('contrast', [
                    ('N', ['zwei']),
                    ('N', [
                        ('cause', [
                            ('N', ['drei']),
                            ('S', ['vier'])])])])])])]),
    ('S', ['fuenf'])])
In [27]:
# Parse the 'maz-10575-excerpt.rs3' example into an RSTTree.
dgtree = example2tree('maz-10575-excerpt.rs3')
In [28]:
# Display the parsed tree.
dgtree
Out[28]:
In [29]:
# Compare the parsed tree with the hand-built input_tree.
dgtree.tree == input_tree
Out[29]:
In [30]:
# Round trip: write the tree's etree to a temporary file and parse that file again.
# FIXME: the re-parsed tree should have 5 EDUs
empty_tree = example2tree(etree2file(RS3FileWriter(dgtree).etree))
empty_tree
Out[30]:
In [31]:
# Build an RS3FileWriter from the tree; the cell displays the writer object.
# FIXME: the output should have 5 EDUs; it also has no root group
RS3FileWriter(dgtree)
Out[31]:
In [32]:
# Write the generated etree to a temporary file; the cell shows the file name.
etree2file(RS3FileWriter(dgtree).etree)
Out[32]:
In [33]:
# Keep a writer instance in `r` for inspection and list the leaves of its dgtree.
r = RS3FileWriter(dgtree, debug=False)
r.dgtree.leaves()
Out[33]:
In [34]:
for i, leaf in enumerate(r.dgtree.leaves()):
print leaf, r.dgtree.leaf_treeposition(i)
In [35]:
# Inspect the writer's treepositions attribute.
r.treepositions
Out[35]:
In [36]:
for seg in r.body['segments']:
print seg.text
In [39]:
from rstviewer import embed_rs3_image
from IPython.display import display
for rfile in dg.corpora.pcc.get_files_by_layer('rst')[:10]:
print os.path.basename(rfile)
embed_rs3_image(rfile)
dgtree = RSTTree(rfile, word_wrap=10)
display(dgtree)
embed_rs3_image(etree2file(RS3FileWriter(dgtree, debug=False).etree))
In [40]:
# Convert every 'rst' layer file of the PCC with RS3FileWriter and store the
# result under the same file name in the output directory.
reconverted_dir = '/tmp/reconverted'
if not os.path.isdir(reconverted_dir):
    # The writer is only given a file path inside this directory, so make
    # sure the directory exists before the first file is written.
    os.makedirs(reconverted_dir)

for rfile in dg.corpora.pcc.get_files_by_layer('rst'):
    dgtree = RSTTree(rfile)
    RS3FileWriter(
        dgtree,
        output_filepath=os.path.join(reconverted_dir, os.path.basename(rfile)),
        debug=False)
In [ ]: