Problem: Node IDs in TigerXML aren't DFS ordered

  • in addition: crossing edges require different drawing strategy
    than regular trees

TODO: create proper dot representation of a TigerSentenceGraph


In [1]:
maz_13125_s389_str = """
<s xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="s389" art_id="13125" orig_id="ID_maz-13125">
<graph root="s389_503">
<terminals>
<t id="s389_1" word="Was" lemma="--" pos="PWS" morph="--"/>
<t id="s389_2" word="man" lemma="--" pos="PIS" morph="--"/>
<t id="s389_3" word="nicht" lemma="--" pos="PTKNEG" morph="--"/>
<t id="s389_4" word="durch" lemma="--" pos="APPR" morph="--"/>
<t id="s389_5" word="Augenschein" lemma="--" pos="NN" morph="--"/>
<t id="s389_6" word="nachprüfen" lemma="--" pos="VVINF" morph="--"/>
<t id="s389_7" word="kann" lemma="--" pos="VMFIN" morph="--"/>
<t id="s389_8" word="," lemma="--" pos="$," morph="--"/>
<t id="s389_9" word="ist" lemma="--" pos="VAFIN" morph="--"/>
<t id="s389_10" word="manipulierbar" lemma="--" pos="ADJD" morph="--"/>
<t id="s389_11" word="." lemma="--" pos="$." morph="--"/>
</terminals>
<nonterminals>
<nt id="s389_500" cat="PP">
<edge label="AC" idref="s389_4"/>
<edge label="NK" idref="s389_5"/>
 </nt>
<nt id="s389_501" cat="VP">
<edge label="OA" idref="s389_1"/>
<edge label="HD" idref="s389_6"/>
<edge label="MO" idref="s389_500"/>
 </nt>
<nt id="s389_502" cat="S">
<edge label="SB" idref="s389_2"/>
<edge label="NG" idref="s389_3"/>
<edge label="HD" idref="s389_7"/>
<edge label="OC" idref="s389_501"/>
 </nt>
<nt id="s389_503" cat="S">
<edge label="HD" idref="s389_9"/>
<edge label="PD" idref="s389_10"/>
<edge label="SB" idref="s389_502"/>
 </nt>
</nonterminals>
</graph>
</s>
"""

In [2]:
from lxml import etree
import discoursegraphs as dg

# Parse the TigerXML sentence string into an lxml element and wrap it in a
# TigerSentenceGraph, which the cells below inspect.
maz_13125_s389 = etree.fromstring(maz_13125_s389_str)
tsg = dg.readwrite.tiger.TigerSentenceGraph(maz_13125_s389)

In [3]:
# Print the string that dg.print_dot() generates for the sentence graph
# (a DOT `digraph`, see the output below).
print dg.print_dot(tsg)


Couldn't import dot_parser, loading of dot files will not be possible.
digraph "" {
"s389_1" [label="Was"];
"s389_501" [label="VP"];
"s389_10" [label="manipulierbar"];
"s389_2" [label="man"];
"discoursegraph:root_node";
"s389_500" [label="PP"];
"s389_503" [label="S"];
"s389_502" [label="S"];
"s389_6" [label="nachprüfen"];
"s389_7" [label="kann"];
"s389_9" [label="ist"];
"s389_8" [label=","];
"s389_3" [label="nicht"];
"s389_5" [label="Augenschein"];
"s389_4" [label="durch"];
"s389_11" [label="."];
"s389_501" -> "s389_6"  [key=0, label="HD"];
"s389_501" -> "s389_1"  [key=0, label="OA"];
"s389_501" -> "s389_500"  [key=0, label="MO"];
"s389_500" -> "s389_5"  [key=0, label="NK"];
"s389_500" -> "s389_4"  [key=0, label="AC"];
"s389_503" -> "s389_10"  [key=0, label="PD"];
"s389_503" -> "s389_11"  [key=0];
"s389_503" -> "s389_502"  [key=0, label="SB"];
"s389_503" -> "s389_9"  [key=0, label="HD"];
"s389_503" -> "s389_8"  [key=0];
"s389_503" -> "discoursegraph:root_node"  [key=0];
"s389_502" -> "s389_2"  [key=0, label="SB"];
"s389_502" -> "s389_3"  [key=0, label="NG"];
"s389_502" -> "s389_501"  [key=0, label="OC"];
"s389_502" -> "s389_7"  [key=0, label="HD"];
}


In [4]:
# %load_ext gvmagic

In [5]:
# %dotstr dg.print_dot(tsg)

In [6]:
foo = u"""
digraph "" {

"discoursegraph:root_node";
"VROOT-s389";
"s389_500" [label="PP"];
"s389_501" [label="VP"];
"s389_502" [label="S"];
"s389_503" [label="S"];

"s389_1" [label="Was"];
"s389_2" [label="man"];
"s389_3" [label="nicht"];
"s389_4" [label="durch"];
"s389_5" [label="Augenschein"];
"s389_6" [label="nachprüfen"];
"s389_7" [label="kann"];
"s389_8" [label=","];
"s389_9" [label="ist"];
"s389_10" [label="manipulierbar"];
"s389_11" [label="."];

edge [style="invis"];
{rank=same; "s389_1" -> "s389_2" -> "s389_3" -> "s389_4" -> "s389_5" -> "s389_6" -> "s389_7" -> "s389_8" -> "s389_9" -> "s389_10" -> "s389_11";}
edge [style=""];

"discoursegraph:root_node" -> "VROOT-s389" [key=0]; // changed direction

"s389_501" -> "s389_6"  [key=0, label="HD"];
"s389_501" -> "s389_1"  [key=0, label="OA"];
"s389_501" -> "s389_500"  [key=0, label="MO"];
"s389_500" -> "s389_5"  [key=0, label="NK"];
"s389_500" -> "s389_4"  [key=0, label="AC"];
"s389_503" -> "s389_9"  [key=0, label="HD"];
"s389_503" -> "s389_10"  [key=0, label="PD"];
"s389_503" -> "s389_502"  [key=0, label="SB"];
"s389_502" -> "s389_2"  [key=0, label="SB"];
"s389_502" -> "s389_3"  [key=0, label="NG"];
"s389_502" -> "s389_501"  [key=0, label="OC"];
"s389_502" -> "s389_7"  [key=0, label="HD"];
"VROOT-s389" -> "s389_503"  [key=0];

"VROOT-s389" -> "s389_8"  [key=0];
"VROOT-s389" -> "s389_11"  [key=0];
}
"""

# %dotstr foo

In [7]:
minimally_changed = u"""
digraph "" {

graph [splines=ortho; nodesep=0.1]

"s389_1" [label="Was"];
"s389_501" [label="VP"];
"s389_10" [label="manipulierbar"];
"s389_2" [label="man"];

"s389_500" [label="PP"];
"s389_503" [label="S"];
"s389_502" [label="S"];
"s389_6" [label="nachprüfen"];
"VROOT-s389";
"s389_7" [label="kann"];
"s389_9" [label="ist"];
"s389_8" [label=","];
"s389_3" [label="nicht"];
"s389_5" [label="Augenschein"];
"s389_4" [label="durch"];
"s389_11" [label="."];
"s389_501" -> "s389_6"  [key=0, label="HD"];
"s389_501" -> "s389_1"  [key=0, label="OA"];
"s389_501" -> "s389_500"  [key=0, label="MO"];
"s389_500" -> "s389_5"  [key=0, label="NK"];
"s389_500" -> "s389_4"  [key=0, label="AC"];
"s389_503" -> "s389_9"  [key=0, label="HD"];
"s389_503" -> "s389_10"  [key=0, label="PD"];
"s389_503" -> "s389_502"  [key=0, label="SB"];
"s389_502" -> "s389_2"  [key=0, label="SB"];
"s389_502" -> "s389_3"  [key=0, label="NG"];
"s389_502" -> "s389_501"  [key=0, label="OC"];
"s389_502" -> "s389_7"  [key=0, label="HD"];
"VROOT-s389" -> "s389_503"  [key=0];

"VROOT-s389" -> "s389_8"  [key=0];
"VROOT-s389" -> "s389_11"  [key=0];
    
edge [style="invis"];
{rank=sink; "s389_1" -> "s389_2" -> "s389_3" -> "s389_4" -> "s389_5" -> "s389_6" -> "s389_7" -> "s389_8" -> "s389_9" -> "s389_10" -> "s389_11";}
edge [style=""];
}
"""

# %dotstr minimally_changed

TODO: why is there an outedge from VROOT to dg:root?

TODO: what do those cat nodes cover?


In [8]:
for node_id in tsg:
    if not dg.istoken(tsg, node_id):
        print node_id, dg.get_span(tsg, node_id)
        print node_id, dg.get_text(tsg, node_id), '\n'


s389_503 ['s389_1', 's389_2', 's389_3', 's389_4', 's389_5', 's389_6', 's389_7', 's389_8', 's389_9', 's389_10', 's389_11']
s389_503 Was man nicht durch Augenschein nachprüfen kann , ist manipulierbar . 

discoursegraph:root_node []
discoursegraph:root_node  

s389_501 ['s389_1', 's389_4', 's389_5', 's389_6']
s389_501 Was durch Augenschein nachprüfen 

s389_500 ['s389_4', 's389_5']
s389_500 durch Augenschein 

s389_502 ['s389_1', 's389_2', 's389_3', 's389_4', 's389_5', 's389_6', 's389_7']
s389_502 Was man nicht durch Augenschein nachprüfen kann 

TODO: implement is_discontinuous()


In [9]:
for node_id in tsg:
    try:
        if not dg.istoken(tsg, node_id):
            span_tokens = dg.get_span(tsg, node_id)
            text = dg.get_text(tsg, node_id)
            print node_id, span_tokens
            print node_id, text, '\n'

            span_onset, span_offset = dg.get_span_offsets(tsg, node_id)
            print span_onset, span_offset
            print [tsg.get_offsets(token) for token in span_tokens]
    except:
        pass


s389_503 ['s389_1', 's389_2', 's389_3', 's389_4', 's389_5', 's389_6', 's389_7', 's389_8', 's389_9', 's389_10', 's389_11']
s389_503 Was man nicht durch Augenschein nachprüfen kann , ist manipulierbar . 

0 69
[(0, 3), (4, 7), (8, 13), (14, 19), (20, 31), (32, 42), (43, 47), (48, 49), (50, 53), (54, 67), (68, 69)]
discoursegraph:root_node []
discoursegraph:root_node  

s389_501 ['s389_1', 's389_4', 's389_5', 's389_6']
s389_501 Was durch Augenschein nachprüfen 

0 42
[(0, 3), (14, 19), (20, 31), (32, 42)]
s389_500 ['s389_4', 's389_5']
s389_500 durch Augenschein 

14 31
[(14, 19), (20, 31)]
s389_502 ['s389_1', 's389_2', 's389_3', 's389_4', 's389_5', 's389_6', 's389_7']
s389_502 Was man nicht durch Augenschein nachprüfen kann 

0 47
[(0, 3), (4, 7), (8, 13), (14, 19), (20, 31), (32, 42), (43, 47)]

In [28]:
span_onset, span_offset = dg.get_span_offsets(tsg, 's389_501')
print span_onset, span_offset
span_range = xrange(span_onset, span_offset+1)

token_offsets = (tsg.get_offsets(tok) for tok in dg.get_span(tsg, 's389_501'))
char_positions = set(itertools.chain.from_iterable(xrange(on, off+1) for on, off in token_offsets))
print char_positions
for item in span_range:
    if item not in char_positions:
        print item, False
        break


0 42
set([0, 1, 2, 3, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42])
4 False

In [37]:
def is_continuous(docgraph, dominating_node):
    """Return True, if the tokens dominated by the given node are all adjacent.

    Two tokens count as adjacent if at most one character (i.e. the
    separator between them) lies between the offset of one token and the
    onset of the next one.

    Parameters
    ----------
    docgraph
        the graph that contains the node; must provide
        ``get_offsets(token_id)`` returning an ``(onset, offset)`` pair
    dominating_node
        ID of the node whose dominated tokens are checked

    Returns
    -------
    bool
        True, if there is no gap in the character range covered by the
        dominated tokens; False otherwise

    Raises
    ------
    KeyError
        propagated from ``dg.get_span_offsets`` if the node doesn't span
        any tokens
    """
    # Called first so that nodes without tokens fail exactly as before.
    first_onset, _ = dg.get_span_offsets(docgraph, dominating_node)

    token_offsets = sorted(
        docgraph.get_offsets(tok)
        for tok in dg.get_span(docgraph, dominating_node))

    # Walk over the tokens in onset order and remember how far the covered
    # range reaches. A token starting more than one position after that
    # point leaves at least one uncovered position, i.e. a gap.
    covered_until = first_onset
    for onset, offset in token_offsets:
        if onset > covered_until + 1:
            return False
        covered_until = max(covered_until, offset)
    return True

In [36]:
# NOTE: 's389_504' is not a node of this sentence (its nonterminals are
# s389_500 .. s389_503), so this call demonstrates the KeyError that
# dg.get_span_offsets() raises inside is_continuous().
is_continuous(tsg, 's389_504')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-36-f4dea28aae28> in <module>()
----> 1 is_continuous(tsg, 's389_504')

<ipython-input-32-060ee29c7b25> in is_continuous(docgraph, dominating_node)
      1 def is_continuous(docgraph, dominating_node):
----> 2     first_onset, last_offset = dg.get_span_offsets(docgraph, dominating_node)
      3     span_range = xrange(first_onset, last_offset+1)
      4 
      5     token_offsets = (docgraph.get_offsets(tok) for tok in dg.get_span(docgraph, dominating_node))

/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/discoursegraph.pyc in get_span_offsets(docgraph, node_id)
    927         return (min(onsets), max(offsets))
    928     except KeyError as _:
--> 929         raise KeyError("Node '{}' doesn't span any tokens.".format(node_id))
    930 
    931 

KeyError: "Node 's389_504' doesn't span any tokens."

In [20]:
import itertools

# two sample (onset, offset) pairs with a gap between them
l = [(0,3), (14, 19)]
# chain.from_iterable() only flattens the pairs: it yields the four boundary
# values, not the positions in between
list(itertools.chain.from_iterable(l))


Out[20]:
[0, 3, 14, 19]

In [24]:
# expand each (onset, offset) pair into all positions it covers (offset
# included) and collect them in one set
set(itertools.chain.from_iterable(xrange(on, off+1) for on, off in l))


Out[24]:
{0, 1, 2, 3, 14, 15, 16, 17, 18, 19}

Proposed solution

get rid of VROOT

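attach all unconnected nodes (e.g. the punctuation tokens) directly to the sentence root node (TODO: confirm the attachment target; the original note breaks off after "to")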

WRONG: make get_span(), get_text() more aggressive

  • current: s389_501: ['s389_1', 's389_4', 's389_5', 's389_6']
  • should: s389_501: ['s389_1', .. 's389_7']
  • I was wrong, the VP here is discontinuous. I hate Tiger!

In [ ]:
SENTENCE_WITH_SECEDGE = """
<s xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="s367" art_id="12976" orig_id="ID_maz-12976">
<graph root="s367_508">
<terminals>
<t id="s367_1" word="Es" lemma="--" pos="PPER" morph="--"></t>
<t id="s367_2" word="kann" lemma="--" pos="VMFIN" morph="--"></t>
<t id="s367_3" word="nicht" lemma="--" pos="PTKNEG" morph="--"></t>
<t id="s367_4" word="sein" lemma="--" pos="VAINF" morph="--"></t>
<t id="s367_5" word="," lemma="--" pos="$," morph="--"></t>
<t id="s367_6" word="dass" lemma="--" pos="KOUS" morph="--">
<secedge label="CP" idref="s367_505"/></t>
<t id="s367_7" word="die" lemma="--" pos="ART" morph="--"></t>
<t id="s367_8" word="Basis" lemma="--" pos="NN" morph="--"></t>
<t id="s367_9" word="gewissermaßen" lemma="--" pos="ADV" morph="--"></t>
<t id="s367_10" word="die" lemma="--" pos="ART" morph="--"></t>
<t id="s367_11" word="Moral" lemma="--" pos="NN" morph="--"></t>
<t id="s367_12" word="pachtet" lemma="--" pos="VVFIN" morph="--"></t>
<t id="s367_13" word="und" lemma="--" pos="KON" morph="--"></t>
<t id="s367_14" word="ihn" lemma="--" pos="PPER" morph="--"></t>
<t id="s367_15" word="die" lemma="--" pos="ART" morph="--"></t>
<t id="s367_16" word="realpolitische" lemma="--" pos="ADJA" morph="--"></t>
<t id="s367_17" word="Schmutzarbeit" lemma="--" pos="NN" morph="--"></t>
<t id="s367_18" word="machen" lemma="--" pos="VVINF" morph="--"></t>
<t id="s367_19" word="lässt" lemma="--" pos="VVFIN" morph="--"></t>
<t id="s367_20" word="." lemma="--" pos="$." morph="--"></t>
</terminals>
<nonterminals>
<nt id="s367_500" cat="NP">
<edge label="NK" idref="s367_7"/>
<edge label="NK" idref="s367_8"/>
<secedge label="SB" idref="s367_505"/>
 </nt>
<nt id="s367_501" cat="NP">
<edge label="NK" idref="s367_10"/>
<edge label="NK" idref="s367_11"/>
 </nt>
<nt id="s367_502" cat="NP">
<edge label="NK" idref="s367_15"/>
<edge label="NK" idref="s367_16"/>
<edge label="NK" idref="s367_17"/>
 </nt>
<nt id="s367_503" cat="S">
<edge label="CP" idref="s367_6"/>
<edge label="MO" idref="s367_9"/>
<edge label="HD" idref="s367_12"/>
<edge label="SB" idref="s367_500"/>
<edge label="OA" idref="s367_501"/>
 </nt>
<nt id="s367_504" cat="VP">
<edge label="HD" idref="s367_18"/>
<edge label="OA" idref="s367_502"/>
 </nt>
<nt id="s367_505" cat="S">
<edge label="OA" idref="s367_14"/>
<edge label="HD" idref="s367_19"/>
<edge label="OC" idref="s367_504"/>
 </nt>
<nt id="s367_506" cat="CS">
<edge label="CD" idref="s367_13"/>
<edge label="CJ" idref="s367_503"/>
<edge label="CJ" idref="s367_505"/>
 </nt>
<nt id="s367_507" cat="NP">
<edge label="PH" idref="s367_1"/>
<edge label="RE" idref="s367_506"/>
 </nt>
<nt id="s367_508" cat="S">
<edge label="HD" idref="s367_2"/>
<edge label="NG" idref="s367_3"/>
<edge label="OC" idref="s367_4"/>
<edge label="SB" idref="s367_507"/>
 </nt>
</nonterminals>
</graph>
</s>
"""

In [ ]:
# parse the TigerXML sentence that contains <secedge> elements
secedge_etree = etree.fromstring(SENTENCE_WITH_SECEDGE)

In [ ]:
# build a TigerSentenceGraph from the parsed sentence element
tsg_secedge = dg.readwrite.tiger.TigerSentenceGraph(secedge_etree)

In [ ]:
# print the string that dg.print_dot() generates for the secedge sentence graph
print dg.print_dot(tsg_secedge)

In [ ]:
# naive plot
# %dotstr dg.print_dot(tsg_secedge)

In [ ]:
fixed = u"""

digraph G {
graph [splines=ortho; nodesep=0.1]

// "VROOT-s367";

"s367_1" [label="Es"];
"s367_2" [label="kann"];
"s367_3" [label="nicht"];
"s367_4" [label="sein"];
"s367_5" [label=","];
"s367_6" [label="dass"];
"s367_7" [label="die"];
"s367_8" [label="Basis"];
"s367_9" [label="gewissermaßen"];
"s367_10" [label="die"];
"s367_11" [label="Moral"];
"s367_12" [label="pachtet"];
"s367_13" [label="und"];
"s367_14" [label="ihn"];
"s367_15" [label="die"];
"s367_16" [label="realpolitische"];
"s367_17" [label="Schmutzarbeit"];
"s367_18" [label="machen"];
"s367_19" [label="lässt"];
"s367_20" [label="."];

"s367_500" [label="NP"];
"s367_501" [label="NP"];
"s367_502" [label="NP"];
"s367_503" [label="S"];
"s367_504" [label="VP"];
"s367_505" [label="S"];
"s367_506" [label="CS"];
"s367_507" [label="NP"];
"s367_508" [label="S"];

"s367_508" -> "s367_507"  [key=0, label="SB"];
"s367_507" -> "s367_506"  [key=0, label="RE"];
"s367_506" -> "s367_503"  [key=0, label="CJ"];
"s367_506" -> "s367_505"  [key=0, label="CJ"];
"s367_505" -> "s367_504"  [key=0, label="OC"];
"s367_504" -> "s367_502"  [key=0, label="OA"];
"s367_503" -> "s367_501"  [key=0, label="OA"];
"s367_503" -> "s367_500"  [key=0, label="SB"];


"s367_508" -> "s367_2"  [key=0, label="HD"];
"s367_508" -> "s367_3"  [key=0, label="NG"];
"s367_508" -> "s367_4"  [key=0, label="OC"];

"s367_507" -> "s367_1"  [key=0, label="PH"];

"s367_506" -> "s367_13"  [key=0, label="CD"];

"s367_505" -> "s367_19"  [key=0, label="HD"];
"s367_505" -> "s367_14"  [key=0, label="OA"];

"s367_504" -> "s367_18"  [key=0, label="HD"];

"s367_503" -> "s367_6"  [key=0, label="CP"];
"s367_503" -> "s367_12"  [key=0, label="HD"];
"s367_503" -> "s367_9"  [key=0, label="MO"];

"s367_502" -> "s367_16"  [key=0, label="NK"];
"s367_502" -> "s367_17"  [key=0, label="NK"];
"s367_502" -> "s367_15"  [key=0, label="NK"];

"s367_501" -> "s367_11"  [key=0, label="NK"];
"s367_501" -> "s367_10"  [key=0, label="NK"];

"s367_500" -> "s367_7"  [key=0, label="NK"];
"s367_500" -> "s367_8"  [key=0, label="NK"];


// "VROOT-s367" -> "s367_5"  [key=0];
// "VROOT-s367" -> "s367_508"  [key=0];
// "VROOT-s367" -> "s367_20"  [key=0];


edge [style="invis"];
{rank=sink; s367_1 -> s367_2 -> s367_3 -> s367_4 -> s367_5 -> s367_6 -> s367_7 -> s367_8 -> s367_9 -> s367_10 -> s367_11 -> s367_12 -> s367_13 -> s367_14 -> s367_15 -> s367_16 -> s367_17 -> s367_18 -> s367_19 -> s367_20;}
edge [style=""];

edge [style="invis"];
{rank=same; "s367_503"; "s367_504" ;}
edge [style=""];

edge [style="invis"];
{rank=same; "s367_500"; "s367_501" ; "s367_502";}
edge [style=""];


// "s367_6" -> "s367_505"  [key=0, style=dotted]; // SECEDGE
// "s367_500" -> "s367_505"  [key=0, label="NK", style=dotted]; // SECEDGE


}
"""

# %dotstr fixed

In [ ]:
# Show the dominated tokens and the covered text of every non-token node
# of the secedge sentence.
for node_id in tsg_secedge:
    if not dg.istoken(tsg_secedge, node_id):
        span = dg.get_span(tsg_secedge, node_id)
        text = dg.get_text(tsg_secedge, node_id)

        # A single pre-formatted argument is used on purpose: under the
        # Python 2 kernel of this notebook, print(a, b) prints the repr of a
        # tuple instead of the two values separated by a space.
        print('%s %s' % (node_id, span))
        print('%s %s' % (node_id, text))

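TODO: implement is_discontinuous() in the library, e.g. as the negation of the is_continuous() prototype from the first part of this notebook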


In [ ]: