In [1]:
import os
import discoursegraphs as dg
import discourseinfostat as di
In [2]:
# Location of the corpus file, taken from the path constant that
# discoursegraphs ships in dg.corpora (the name suggests TueBa-D/Z release 8 -- TODO confirm).
TUEBADZ8_FILE = dg.corpora.TUEBADZ_PATH
In [3]:
# Create a reader for the ExportXML corpus file (debug mode switched off).
# Documents are pulled from `corpus` one at a time.
corpus = dg.read_exportxml(TUEBADZ8_FILE, debug=False)
In [4]:
# Fetch the next document from the corpus reader and keep it as `doc`.
# NOTE(review): .next() is the Python 2 iterator method; if `corpus` implements
# the iterator protocol, the builtin next(corpus) is the portable spelling.
doc = corpus.next()
In [5]:
# Read the same corpus file a second time, now with debug=True, and take the
# next document from that reader.
# NOTE(review): presumably debug mode yields an unprocessed document, since it
# is handed to ExportXMLDocumentGraph below -- confirm against discoursegraphs.
debug_corpus = dg.read_exportxml(TUEBADZ8_FILE, debug=True)
debug_doc = debug_corpus.next()
# Build the document graph explicitly from the debug document.
# `edg` is not referenced again in the cells shown here.
edg = dg.readwrite.exportxml.ExportXMLDocumentGraph(debug_doc)
In [6]:
# %matplotlib inline
# %load_ext gvmagic
In [7]:
# %dotstr dg.print_dot(doc)
In [8]:
# for i, doc in enumerate(corpus):
#     try:
#         dg.print_dot(doc)
#         # print i, "success"
#     except:
#         print i, "failed"
In [9]:
# dg.info(doc)
In [10]:
import itertools
onset = 0
offset = 0
for token_id, token_str in itertools.islice(doc.get_tokens(), 5):
offset = onset + len(token_str)
print onset, token_str, offset
doc.node[token_id]['{}:{}'.format(doc.ns, 'onset')] = onset
doc.node[token_id]['{}:{}'.format(doc.ns, 'offset')] = offset
print doc.node[token_id]
onset = offset + 1
In [11]:
# Print whatever dg.get_text() returns for the whole document.
print dg.get_text(doc)
What is the difference between exportxml:coreference and exportxml:coreferential?

The corpus metadata only defines coreferential.
I introduced coreference myself; this layer is added
to every <relation> that has a target attribute, as well as to every
<splitRelation>.
<edge name="relation" parent="word|node">
<enum-attr name="type">
<val name="anaphoric" description="Anaphorisches Pronomen"/>
<val name="cataphoric" description="Kataphorisches Pronomen"/>
<val name="coreferential" description="Diskurs-altes nicht-Pronomen"/>
</enum-attr>
<node-ref name="target"/>
</edge>
In [12]:
# Number of pointing chains found in the 'exportxml:coreference' layer.
len(dg.get_pointing_chains(doc, layer='exportxml:coreference'))
Out[12]:
In [13]:
# Number of pointing chains found in the 'exportxml:coreferential' layer.
len(dg.get_pointing_chains(doc, layer='exportxml:coreferential'))
Out[13]:
In [14]:
# Collect the chains of the 'exportxml:coreference' layer as a set;
# tuple() makes each chain hashable so it can be a set member.
coreference_set = {
    tuple(chain)
    for chain in dg.get_pointing_chains(doc, layer='exportxml:coreference')
}
In [15]:
# Collect the chains of the 'exportxml:coreferential' layer as a set;
# tuple() makes each chain hashable so it can be a set member.
coreferential_set = {
    tuple(chain)
    for chain in dg.get_pointing_chains(doc, layer='exportxml:coreferential')
}
In [16]:
# Chains from the 'coreferential' layer that have no identical chain
# in the 'coreference' layer.
not_in_coreference_set = coreferential_set - coreference_set
In [17]:
# Chains from the 'coreference' layer that have no identical chain
# in the 'coreferential' layer.
not_in_coreferential_set = coreference_set - coreferential_set
In [18]:
# Print every chain that occurs only in coreferential_set: each element of the
# chain is rendered with dg.get_text() and the pieces are joined with ' -> '.
# NOTE(review): indentation was lost in this export; the bare `print` (empty
# line) presumably belongs to the loop body, separating the chains -- confirm.
for coref_chain in not_in_coreference_set:
print ' -> '.join(dg.get_text(doc, token) for token in coref_chain)
print
In [19]:
# Print every chain that occurs only in coreference_set: each element of the
# chain is rendered with dg.get_text() and the pieces are joined with ' -> '.
# NOTE(review): indentation was lost in this export; the bare `print` (empty
# line) presumably belongs to the loop body, separating the chains -- confirm.
for coref_chain in not_in_coreferential_set:
print ' -> '.join(dg.get_text(doc, token) for token in coref_chain)
print
Chains from coreference often seem to subsume the ones from coreferential

This is how relations are specified in TüBa-D/Z:
<edge name="relation" parent="word|node">
<enum-attr name="type">
<val name="anaphoric" description="Anaphorisches Pronomen"/>
<val name="cataphoric" description="Kataphorisches Pronomen"/>
<val name="coreferential" description="Diskurs-altes nicht-Pronomen"/>
</enum-attr>
<node-ref name="target"/>
</edge>
How coreferential edges are constructed

The parent node of a <relation> element gets a {'relation': 'coreferential'} attribute
as well as an exportxml:coreferential layer:
self.node[parent_node_id].update({'relation': reltype})
self.add_layer(parent_node_id, self.ns+':'+reltype)
For each <relation> with a target attribute,
add_relation() adds an edge that carries both its reltype layer
and the exportxml:coreference layer:
# reltype: anaphoric, cataphoric, coreferential
self.add_edge(parent_node_id, target_id,
layers={self.ns, self.ns+':'+reltype,
self.ns+':coreference'},
label=reltype,
edge_type=dg.EdgeTypes.pointing_relation)
In [ ]: