In [1]:
# TigerXML markup of sentence s389 (art_id 13125): 11 terminals and four
# nonterminals (PP, VP, S, S); the <graph> element names s389_503 as root.
maz_13125_s389_str = """
<s xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="s389" art_id="13125" orig_id="ID_maz-13125">
<graph root="s389_503">
<terminals>
<t id="s389_1" word="Was" lemma="--" pos="PWS" morph="--"/>
<t id="s389_2" word="man" lemma="--" pos="PIS" morph="--"/>
<t id="s389_3" word="nicht" lemma="--" pos="PTKNEG" morph="--"/>
<t id="s389_4" word="durch" lemma="--" pos="APPR" morph="--"/>
<t id="s389_5" word="Augenschein" lemma="--" pos="NN" morph="--"/>
<t id="s389_6" word="nachprüfen" lemma="--" pos="VVINF" morph="--"/>
<t id="s389_7" word="kann" lemma="--" pos="VMFIN" morph="--"/>
<t id="s389_8" word="," lemma="--" pos="$," morph="--"/>
<t id="s389_9" word="ist" lemma="--" pos="VAFIN" morph="--"/>
<t id="s389_10" word="manipulierbar" lemma="--" pos="ADJD" morph="--"/>
<t id="s389_11" word="." lemma="--" pos="$." morph="--"/>
</terminals>
<nonterminals>
<nt id="s389_500" cat="PP">
<edge label="AC" idref="s389_4"/>
<edge label="NK" idref="s389_5"/>
</nt>
<nt id="s389_501" cat="VP">
<edge label="OA" idref="s389_1"/>
<edge label="HD" idref="s389_6"/>
<edge label="MO" idref="s389_500"/>
</nt>
<nt id="s389_502" cat="S">
<edge label="SB" idref="s389_2"/>
<edge label="NG" idref="s389_3"/>
<edge label="HD" idref="s389_7"/>
<edge label="OC" idref="s389_501"/>
</nt>
<nt id="s389_503" cat="S">
<edge label="HD" idref="s389_9"/>
<edge label="PD" idref="s389_10"/>
<edge label="SB" idref="s389_502"/>
</nt>
</nonterminals>
</graph>
</s>
"""
In [2]:
from lxml import etree
import discoursegraphs as dg
# Parse the TigerXML string into an lxml element (the <s> element).
maz_13125_s389 = etree.fromstring(maz_13125_s389_str)
# Wrap the parsed sentence element in a discoursegraphs TigerSentenceGraph.
tsg = dg.readwrite.tiger.TigerSentenceGraph(maz_13125_s389)
In [3]:
# Print what dg.print_dot() returns for the sentence graph (presumably DOT
# source, given the %dotstr cells in this notebook).
print dg.print_dot(tsg)
In [4]:
# %load_ext gvmagic
In [5]:
# %dotstr dg.print_dot(tsg)
In [6]:
# DOT source for sentence s389 including the "discoursegraph:root_node" and
# "VROOT-s389" nodes. The tokens are chained with invisible edges inside a
# `rank=same` subgraph; the root_node -> VROOT edge is annotated
# "changed direction" inside the DOT text.
foo = u"""
digraph "" {
"discoursegraph:root_node";
"VROOT-s389";
"s389_500" [label="PP"];
"s389_501" [label="VP"];
"s389_502" [label="S"];
"s389_503" [label="S"];
"s389_1" [label="Was"];
"s389_2" [label="man"];
"s389_3" [label="nicht"];
"s389_4" [label="durch"];
"s389_5" [label="Augenschein"];
"s389_6" [label="nachprüfen"];
"s389_7" [label="kann"];
"s389_8" [label=","];
"s389_9" [label="ist"];
"s389_10" [label="manipulierbar"];
"s389_11" [label="."];
edge [style="invis"];
{rank=same; "s389_1" -> "s389_2" -> "s389_3" -> "s389_4" -> "s389_5" -> "s389_6" -> "s389_7" -> "s389_8" -> "s389_9" -> "s389_10" -> "s389_11";}
edge [style=""];
"discoursegraph:root_node" -> "VROOT-s389" [key=0]; // changed direction
"s389_501" -> "s389_6" [key=0, label="HD"];
"s389_501" -> "s389_1" [key=0, label="OA"];
"s389_501" -> "s389_500" [key=0, label="MO"];
"s389_500" -> "s389_5" [key=0, label="NK"];
"s389_500" -> "s389_4" [key=0, label="AC"];
"s389_503" -> "s389_9" [key=0, label="HD"];
"s389_503" -> "s389_10" [key=0, label="PD"];
"s389_503" -> "s389_502" [key=0, label="SB"];
"s389_502" -> "s389_2" [key=0, label="SB"];
"s389_502" -> "s389_3" [key=0, label="NG"];
"s389_502" -> "s389_501" [key=0, label="OC"];
"s389_502" -> "s389_7" [key=0, label="HD"];
"VROOT-s389" -> "s389_503" [key=0];
"VROOT-s389" -> "s389_8" [key=0];
"VROOT-s389" -> "s389_11" [key=0];
}
"""
# %dotstr foo
In [7]:
# Variant of the s389 DOT source: sets `splines=ortho` and `nodesep=0.1`,
# has no "discoursegraph:root_node" node, and puts the invisible token chain
# into a `rank=sink` subgraph after the labelled edges.
minimally_changed = u"""
digraph "" {
graph [splines=ortho; nodesep=0.1]
"s389_1" [label="Was"];
"s389_501" [label="VP"];
"s389_10" [label="manipulierbar"];
"s389_2" [label="man"];
"s389_500" [label="PP"];
"s389_503" [label="S"];
"s389_502" [label="S"];
"s389_6" [label="nachprüfen"];
"VROOT-s389";
"s389_7" [label="kann"];
"s389_9" [label="ist"];
"s389_8" [label=","];
"s389_3" [label="nicht"];
"s389_5" [label="Augenschein"];
"s389_4" [label="durch"];
"s389_11" [label="."];
"s389_501" -> "s389_6" [key=0, label="HD"];
"s389_501" -> "s389_1" [key=0, label="OA"];
"s389_501" -> "s389_500" [key=0, label="MO"];
"s389_500" -> "s389_5" [key=0, label="NK"];
"s389_500" -> "s389_4" [key=0, label="AC"];
"s389_503" -> "s389_9" [key=0, label="HD"];
"s389_503" -> "s389_10" [key=0, label="PD"];
"s389_503" -> "s389_502" [key=0, label="SB"];
"s389_502" -> "s389_2" [key=0, label="SB"];
"s389_502" -> "s389_3" [key=0, label="NG"];
"s389_502" -> "s389_501" [key=0, label="OC"];
"s389_502" -> "s389_7" [key=0, label="HD"];
"VROOT-s389" -> "s389_503" [key=0];
"VROOT-s389" -> "s389_8" [key=0];
"VROOT-s389" -> "s389_11" [key=0];
edge [style="invis"];
{rank=sink; "s389_1" -> "s389_2" -> "s389_3" -> "s389_4" -> "s389_5" -> "s389_6" -> "s389_7" -> "s389_8" -> "s389_9" -> "s389_10" -> "s389_11";}
edge [style=""];
}
"""
# %dotstr minimally_changed
In [8]:
# For every node that dg.istoken() does not classify as a token, print the
# result of dg.get_span() and of dg.get_text() for that node.
# NOTE(review): nesting reconstructed from a flattened export — both prints
# are assumed to sit inside the `if`; confirm against the original notebook.
for node_id in tsg:
    if not dg.istoken(tsg, node_id):
        print node_id, dg.get_span(tsg, node_id)
        print node_id, dg.get_text(tsg, node_id), '\n'
In [9]:
for node_id in tsg:
try:
if not dg.istoken(tsg, node_id):
span_tokens = dg.get_span(tsg, node_id)
text = dg.get_text(tsg, node_id)
print node_id, span_tokens
print node_id, text, '\n'
span_onset, span_offset = dg.get_span_offsets(tsg, node_id)
print span_onset, span_offset
print [tsg.get_offsets(token) for token in span_tokens]
except:
pass
In [28]:
span_onset, span_offset = dg.get_span_offsets(tsg, 's389_501')
print span_onset, span_offset
span_range = xrange(span_onset, span_offset+1)
token_offsets = (tsg.get_offsets(tok) for tok in dg.get_span(tsg, 's389_501'))
char_positions = set(itertools.chain.from_iterable(xrange(on, off+1) for on, off in token_offsets))
print char_positions
for item in span_range:
if item not in char_positions:
print item, False
break
In [37]:
def is_continuous(docgraph, dominating_node):
    """Return True if the tokens dominated by the given node are all adjacent.

    The character range that ``dg.get_span_offsets`` reports for
    ``dominating_node`` is checked against the ``(onset, offset)`` pairs
    that ``docgraph.get_offsets`` returns for every token in
    ``dg.get_span(docgraph, dominating_node)``. The span is continuous when
    every integer position from the first onset to the last offset (both
    inclusive) lies inside at least one token's onset..offset range
    (both inclusive).

    Parameters
    ----------
    docgraph : graph object providing ``get_offsets(token)``
        The document graph that contains ``dominating_node``.
    dominating_node : node id
        The node whose dominated tokens are inspected.

    Returns
    -------
    bool
        False as soon as a position inside the span is not covered by any
        token, True otherwise.
    """
    first_onset, last_offset = dg.get_span_offsets(docgraph, dominating_node)
    token_offsets = sorted(
        docgraph.get_offsets(tok)
        for tok in dg.get_span(docgraph, dominating_node))

    # Sweep the token ranges in onset order and track the right-most covered
    # position. This gives the same answer as enumerating every character
    # position into a set, without needing itertools/xrange.
    covered_upto = first_onset - 1
    for onset, offset in token_offsets:
        if onset > covered_upto + 1:
            # Position covered_upto + 1 is not covered by any token; later
            # ranges start even further right, so they cannot close the gap.
            break
        covered_upto = max(covered_upto, offset)
    return covered_upto >= last_offset
In [36]:
# NOTE(review): the s389 TigerXML in this notebook declares the nonterminals
# s389_500 .. s389_503 only; 's389_504' is not among them — confirm that this
# node id is intended.
is_continuous(tsg, 's389_504')
In [20]:
import itertools
# Two sample (onset, offset) pairs with a gap between them.
l = [(0,3), (14, 19)]
# Flatten the pairs into one list of numbers.
list(itertools.chain.from_iterable(l))
Out[20]:
In [24]:
# Expand each (onset, offset) pair into every integer from onset to offset
# (inclusive) and collect all of them in one set.
set(itertools.chain.from_iterable(xrange(on, off+1) for on, off in l))
Out[24]:
In [ ]:
# TigerXML markup of sentence s367 (art_id 12976). Besides regular <edge>
# elements it contains two <secedge> elements — one on terminal s367_6 and
# one on nonterminal s367_500 — both referring to s367_505.
SENTENCE_WITH_SECEDGE = """
<s xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="s367" art_id="12976" orig_id="ID_maz-12976">
<graph root="s367_508">
<terminals>
<t id="s367_1" word="Es" lemma="--" pos="PPER" morph="--"></t>
<t id="s367_2" word="kann" lemma="--" pos="VMFIN" morph="--"></t>
<t id="s367_3" word="nicht" lemma="--" pos="PTKNEG" morph="--"></t>
<t id="s367_4" word="sein" lemma="--" pos="VAINF" morph="--"></t>
<t id="s367_5" word="," lemma="--" pos="$," morph="--"></t>
<t id="s367_6" word="dass" lemma="--" pos="KOUS" morph="--">
<secedge label="CP" idref="s367_505"/></t>
<t id="s367_7" word="die" lemma="--" pos="ART" morph="--"></t>
<t id="s367_8" word="Basis" lemma="--" pos="NN" morph="--"></t>
<t id="s367_9" word="gewissermaßen" lemma="--" pos="ADV" morph="--"></t>
<t id="s367_10" word="die" lemma="--" pos="ART" morph="--"></t>
<t id="s367_11" word="Moral" lemma="--" pos="NN" morph="--"></t>
<t id="s367_12" word="pachtet" lemma="--" pos="VVFIN" morph="--"></t>
<t id="s367_13" word="und" lemma="--" pos="KON" morph="--"></t>
<t id="s367_14" word="ihn" lemma="--" pos="PPER" morph="--"></t>
<t id="s367_15" word="die" lemma="--" pos="ART" morph="--"></t>
<t id="s367_16" word="realpolitische" lemma="--" pos="ADJA" morph="--"></t>
<t id="s367_17" word="Schmutzarbeit" lemma="--" pos="NN" morph="--"></t>
<t id="s367_18" word="machen" lemma="--" pos="VVINF" morph="--"></t>
<t id="s367_19" word="lässt" lemma="--" pos="VVFIN" morph="--"></t>
<t id="s367_20" word="." lemma="--" pos="$." morph="--"></t>
</terminals>
<nonterminals>
<nt id="s367_500" cat="NP">
<edge label="NK" idref="s367_7"/>
<edge label="NK" idref="s367_8"/>
<secedge label="SB" idref="s367_505"/>
</nt>
<nt id="s367_501" cat="NP">
<edge label="NK" idref="s367_10"/>
<edge label="NK" idref="s367_11"/>
</nt>
<nt id="s367_502" cat="NP">
<edge label="NK" idref="s367_15"/>
<edge label="NK" idref="s367_16"/>
<edge label="NK" idref="s367_17"/>
</nt>
<nt id="s367_503" cat="S">
<edge label="CP" idref="s367_6"/>
<edge label="MO" idref="s367_9"/>
<edge label="HD" idref="s367_12"/>
<edge label="SB" idref="s367_500"/>
<edge label="OA" idref="s367_501"/>
</nt>
<nt id="s367_504" cat="VP">
<edge label="HD" idref="s367_18"/>
<edge label="OA" idref="s367_502"/>
</nt>
<nt id="s367_505" cat="S">
<edge label="OA" idref="s367_14"/>
<edge label="HD" idref="s367_19"/>
<edge label="OC" idref="s367_504"/>
</nt>
<nt id="s367_506" cat="CS">
<edge label="CD" idref="s367_13"/>
<edge label="CJ" idref="s367_503"/>
<edge label="CJ" idref="s367_505"/>
</nt>
<nt id="s367_507" cat="NP">
<edge label="PH" idref="s367_1"/>
<edge label="RE" idref="s367_506"/>
</nt>
<nt id="s367_508" cat="S">
<edge label="HD" idref="s367_2"/>
<edge label="NG" idref="s367_3"/>
<edge label="OC" idref="s367_4"/>
<edge label="SB" idref="s367_507"/>
</nt>
</nonterminals>
</graph>
</s>
"""
In [ ]:
# Parse the s367 TigerXML string into an lxml element.
secedge_etree = etree.fromstring(SENTENCE_WITH_SECEDGE)
In [ ]:
# Build a TigerSentenceGraph from the parsed sentence that contains <secedge> elements.
tsg_secedge = dg.readwrite.tiger.TigerSentenceGraph(secedge_etree)
In [ ]:
# Print what dg.print_dot() returns for the s367 sentence graph (presumably
# DOT source, given the %dotstr cells in this notebook).
print dg.print_dot(tsg_secedge)
In [ ]:
# naive plot
# %dotstr dg.print_dot(tsg_secedge)
In [ ]:
# DOT source for sentence s367. Inside the DOT text the VROOT node, its three
# edges and both secondary edges are commented out; the tokens are chained
# with invisible edges in a `rank=sink` subgraph, and the nonterminals
# 503/504 and 500/501/502 are each placed in a `rank=same` subgraph.
fixed = u"""
digraph G {
graph [splines=ortho; nodesep=0.1]
// "VROOT-s367";
"s367_1" [label="Es"];
"s367_2" [label="kann"];
"s367_3" [label="nicht"];
"s367_4" [label="sein"];
"s367_5" [label=","];
"s367_6" [label="dass"];
"s367_7" [label="die"];
"s367_8" [label="Basis"];
"s367_9" [label="gewissermaßen"];
"s367_10" [label="die"];
"s367_11" [label="Moral"];
"s367_12" [label="pachtet"];
"s367_13" [label="und"];
"s367_14" [label="ihn"];
"s367_15" [label="die"];
"s367_16" [label="realpolitische"];
"s367_17" [label="Schmutzarbeit"];
"s367_18" [label="machen"];
"s367_19" [label="lässt"];
"s367_20" [label="."];
"s367_500" [label="NP"];
"s367_501" [label="NP"];
"s367_502" [label="NP"];
"s367_503" [label="S"];
"s367_504" [label="VP"];
"s367_505" [label="S"];
"s367_506" [label="CS"];
"s367_507" [label="NP"];
"s367_508" [label="S"];
"s367_508" -> "s367_507" [key=0, label="SB"];
"s367_507" -> "s367_506" [key=0, label="RE"];
"s367_506" -> "s367_503" [key=0, label="CJ"];
"s367_506" -> "s367_505" [key=0, label="CJ"];
"s367_505" -> "s367_504" [key=0, label="OC"];
"s367_504" -> "s367_502" [key=0, label="OA"];
"s367_503" -> "s367_501" [key=0, label="OA"];
"s367_503" -> "s367_500" [key=0, label="SB"];
"s367_508" -> "s367_2" [key=0, label="HD"];
"s367_508" -> "s367_3" [key=0, label="NG"];
"s367_508" -> "s367_4" [key=0, label="OC"];
"s367_507" -> "s367_1" [key=0, label="PH"];
"s367_506" -> "s367_13" [key=0, label="CD"];
"s367_505" -> "s367_19" [key=0, label="HD"];
"s367_505" -> "s367_14" [key=0, label="OA"];
"s367_504" -> "s367_18" [key=0, label="HD"];
"s367_503" -> "s367_6" [key=0, label="CP"];
"s367_503" -> "s367_12" [key=0, label="HD"];
"s367_503" -> "s367_9" [key=0, label="MO"];
"s367_502" -> "s367_16" [key=0, label="NK"];
"s367_502" -> "s367_17" [key=0, label="NK"];
"s367_502" -> "s367_15" [key=0, label="NK"];
"s367_501" -> "s367_11" [key=0, label="NK"];
"s367_501" -> "s367_10" [key=0, label="NK"];
"s367_500" -> "s367_7" [key=0, label="NK"];
"s367_500" -> "s367_8" [key=0, label="NK"];
// "VROOT-s367" -> "s367_5" [key=0];
// "VROOT-s367" -> "s367_508" [key=0];
// "VROOT-s367" -> "s367_20" [key=0];
edge [style="invis"];
{rank=sink; s367_1 -> s367_2 -> s367_3 -> s367_4 -> s367_5 -> s367_6 -> s367_7 -> s367_8 -> s367_9 -> s367_10 -> s367_11 -> s367_12 -> s367_13 -> s367_14 -> s367_15 -> s367_16 -> s367_17 -> s367_18 -> s367_19 -> s367_20;}
edge [style=""];
edge [style="invis"];
{rank=same; "s367_503"; "s367_504" ;}
edge [style=""];
edge [style="invis"];
{rank=same; "s367_500"; "s367_501" ; "s367_502";}
edge [style=""];
// "s367_6" -> "s367_505" [key=0, style=dotted]; // SECEDGE
// "s367_500" -> "s367_505" [key=0, label="NK", style=dotted]; // SECEDGE
}
"""
# %dotstr fixed
In [ ]:
# For every node of the s367 graph that dg.istoken() does not classify as a
# token, print the node id with its token span and with its text.
# NOTE(review): the other cells use Python 2 print statements; under Python 2
# a call like print(a, b) prints a tuple — confirm that is intended here.
# NOTE(review): nesting reconstructed from a flattened export; all four
# statements are assumed to sit inside the `if`.
for node_id in tsg_secedge:
    if not dg.istoken(tsg_secedge, node_id):
        span = dg.get_span(tsg_secedge, node_id)
        text = dg.get_text(tsg_secedge, node_id)
        print(node_id, span)
        print(node_id, text)
In [ ]: