RST and syntax:

Task 1: Which syntactic subordinate clauses match with an RST EDU?

Task 2: Are these EDUs satellites or nuclei?

Task 3: Does this correlate with certain RST relation types?

neo4j query

MATCH ()-[r1]->(p)-[*]->(c)<-[:LINKS_TO {layers: ["rst", "rst:token"], edge_type: "spans"}]-(rst)

WHERE p.tiger:cat = "S" AND (r1.label = "MO" OR r1.label = "SB" OR r1.label = "RC") AND EXISTS(c.tiger:token) AND EXISTS(rst.rst:text)

RETURN p,c,rst ORDER BY c.tiger:id;


In [1]:
import os

import discoursegraphs as dg
from discoursegraphs.corpora import pcc

In [2]:
# List every annotation file the corpus holds for document 'maz-17673'
# (the output shows one file per layer: rst, connectors, coreference, syntax).
pcc.get_files_by_document_id('maz-17673')


Out[2]:
['/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/discoursegraphs-0.1.2-py2.7.egg/data/potsdam-commentary-corpus-2.0.0/rst/maz-17673.rs3',
 '/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/discoursegraphs-0.1.2-py2.7.egg/data/potsdam-commentary-corpus-2.0.0/connectors/maz-17673.xml',
 '/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/discoursegraphs-0.1.2-py2.7.egg/data/potsdam-commentary-corpus-2.0.0/coreference/maz-17673.mmax',
 '/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/discoursegraphs-0.1.2-py2.7.egg/data/potsdam-commentary-corpus-2.0.0/syntax/maz-17673.xml']

In [3]:
# Resolve the TIGER syntax file through the corpus API instead of
# hard-coding an absolute path that only exists on one machine.
# The syntax file is the one stored in the corpus' ``syntax`` directory.
tiger_path = next(
    path for path in pcc.get_files_by_document_id('maz-17673')
    if os.path.basename(os.path.dirname(path)) == 'syntax')
tdg = dg.read_tiger(tiger_path)

In [4]:
# Resolve the RST file through the corpus API instead of hard-coding an
# absolute path that only exists on one machine. It is the only file of
# this document with the ``.rs3`` extension.
rs3_path = next(
    path for path in pcc.get_files_by_document_id('maz-17673')
    if path.endswith('.rs3'))
rdg = dg.read_rs3(rs3_path)

In [5]:
# dg.info(tdg)

In [6]:
# Inspect the attribute dict stored for a single node of the TIGER graph.
tdg.node['s999_506']


Out[6]:
{'label': 'VP',
 'layers': {'tiger', 'tiger:syntax'},
 'tiger:cat': 'VP',
 'tiger:id': 's999_506'}

In [7]:
def get_subordinate_clauses(tiger_docgraph):
    """
    Collect the node IDs that are treated as subordinate clause
    constituents of a TIGER syntax tree.

    An edge qualifies if it carries one of the ``tiger:label`` values
    'MO', 'RC' or 'SB', its source node has the ``tiger:cat`` 'S' and
    its target node is not a token.

    Parameters
    ----------
    tiger_docgraph : document graph
        document graph of a TIGER syntax tree

    Returns
    -------
    subord_clause_nodes : list
        target node IDs of all qualifying edges, in the order in which
        ``dg.select_edges_by_attribute`` returns the edges
    """
    candidate_edges = dg.select_edges_by_attribute(
        tiger_docgraph, attribute='tiger:label',
        value=['MO', 'RC', 'SB'])

    # keep the dominated node whenever the dominating node is an 'S'
    # and the dominated node is a constituent rather than a token
    return [child_id for parent_id, child_id in candidate_edges
            if tiger_docgraph.node[parent_id].get('tiger:cat') == 'S'
            and not dg.istoken(tiger_docgraph, child_id)]

In [8]:
# Merge the RST graph into the TIGER graph. The return value is not used;
# later cells read both the syntax and the RST annotation from ``tdg``.
tdg.merge_graphs(rdg)

In [9]:
from discoursegraphs.readwrite.rst import rs3

Task 1: Which syntactic subordinate clauses match with an RST EDU?


In [10]:
# (onset, offset) span of every subordinate clause constituent
subord_offsets = []
for clause_id in get_subordinate_clauses(tdg):
    subord_offsets.append(dg.get_span_offsets(tdg, clause_id))

In [11]:
# (onset, offset) span of every RST EDU
edu_offsets = []
for edu_id in rs3.get_edus(tdg):
    edu_offsets.append(dg.get_span_offsets(tdg, edu_id))

Problem: an EDU and a subordinate clause may cover the same clause but differ in whether the trailing punctuation is included

  • subordinate clause: (318, 366)
  • EDU: (318, 368)

In [12]:
# Spans that are both an EDU span and a subordinate clause span
# (exact match on onset and offset).
set(edu_offsets).intersection(set(subord_offsets))


Out[12]:
set()

In [13]:
dg.get_text(tdg)[318:366] # subordinate clause span (318, 366): ends before the final ' .'


Out[13]:
u'um m\xf6gliche Gewaltt\xe4ter von ihrem Tun abzuhalten'

In [14]:
dg.get_text(tdg)[318:368] # EDU span (318, 368): same text plus the final ' .'


Out[14]:
u'um m\xf6gliche Gewaltt\xe4ter von ihrem Tun abzuhalten .'

Variant 1: the EDU contains the subordinate clause


In [15]:
# Report every EDU that fully contains a subordinate clause, provided the
# clause makes up at least 90% of the EDU's span.
for edu_on, edu_off in edu_offsets:
    edu_len = float(edu_off - edu_on)
    for clause_on, clause_off in subord_offsets:
        if clause_on < edu_on or clause_off > edu_off:
            continue  # clause is not completely inside this EDU
        share = (clause_off - clause_on) / edu_len
        if share >= 0.9:
            print (edu_on, edu_off), "includes", (clause_on, clause_off), "share: ", share


(196, 368) includes (203, 366) share:  0.947674418605
(318, 368) includes (318, 366) share:  0.96
(369, 540) includes (376, 538) share:  0.947368421053
(487, 540) includes (487, 538) share:  0.962264150943
(512, 540) includes (512, 538) share:  0.928571428571
(550, 597) includes (550, 595) share:  0.957446808511

Variant 2: the subordinate clause contains the EDU


In [16]:
# Report every subordinate clause that fully contains an EDU, provided the
# EDU makes up at least 90% of the clause's span.
for clause_on, clause_off in subord_offsets:
    clause_len = clause_off - clause_on
    for edu_on, edu_off in edu_offsets:
        if edu_on < clause_on or edu_off > clause_off:
            continue  # EDU is not completely inside this clause
        share = float(edu_off - edu_on) / clause_len
        if share >= 0.9:
            print (clause_on, clause_off), "includes", (edu_on, edu_off), "share: ", share

subtask: allow queries for overlapping spans


In [17]:
# Text of the subordinate clause span (203, 366) reported under variant 1.
dg.get_text(tdg)[203:366]


Out[17]:
u'die Zeiten , in denen ein Einsatzbus mit mehreren Beamten auf dem Parkplatz stundenlang Position beziehen musste , um m\xf6gliche Gewaltt\xe4ter von ihrem Tun abzuhalten'

In [18]:
# Text of the EDU span (196, 368) that contains the clause span (203, 366).
dg.get_text(tdg)[196:368]


Out[18]:
u'Vorbei die Zeiten , in denen ein Einsatzbus mit mehreren Beamten auf dem Parkplatz stundenlang Position beziehen musste , um m\xf6gliche Gewaltt\xe4ter von ihrem Tun abzuhalten .'

In [140]:
from operator import itemgetter

def max_overlap(overlapping_nodes):
    """
    given a list of candidate nodes, return the entry of the
    best matching node, i.e. the one with the highest overlap.

    Parameters
    ----------
    overlapping_nodes : list of dict
        A list of candidate nodes. Each node is represented as a
        dict that must contain the key ``overlap`` (percentage of
        overlap with another span). The dicts built by
        ``find_overlapping_nodes`` additionally carry ``node_id``
        (the node ID) and ``interval`` (string onset int,
        string offset int).

    Returns
    -------
    max_overlap : dict
        the element of ``overlapping_nodes`` with the highest
        ``overlap`` value (the dict itself, not a copy). If several
        candidates share the highest value, the first of them in
        input order is returned.

    Raises
    ------
    ValueError
        if ``overlapping_nodes`` is empty
    """
    if not overlapping_nodes:
        raise ValueError(
            "max_overlap() needs at least one candidate node")
    # max() returns the first maximal element, which is the same element
    # that sorted(..., reverse=True)[0] yields, without sorting the list.
    return max(overlapping_nodes, key=itemgetter('overlap'))

In [141]:
from collections import defaultdict

from intervaltree import IntervalTree, Interval

import discoursegraphs as dg
from discoursegraphs.readwrite.rst import rs3


def _group_nodes_by_span(docgraph, nodes):
    """Map each (onset, offset) span to the list of nodes covering it."""
    # there might be more than one node covering the same span
    span2nodes = defaultdict(list)
    for node in nodes:
        span2nodes[dg.get_span_offsets(docgraph, node)].append(node)
    return span2nodes


def find_overlapping_nodes(
    docgraph, local_nodes, other_nodes,
    overlap_threshold=95, debug=False, max_len_diff=2):
    """
    For each node in ``local_nodes``, find the node from ``other_nodes``
    whose span overlaps best with it.

    The overlap of two spans is measured in percent of the longer of
    the two spans. A pair of spans counts as a match if

    - the overlap is at least ``overlap_threshold`` percent, or
    - the overlapping part is at most ``max_len_diff`` characters
      shorter than the longer span. This lets two spans match that
      differ only by a short suffix such as a trailing `` .``, even
      with ``overlap_threshold=100``.

    Parameters
    ----------
    docgraph : document graph
        the graph that contains all the given nodes; it is passed to
        ``dg.get_span_offsets`` to look up each node's span
    local_nodes : iterable of node IDs
        the nodes for which a best match is searched
    other_nodes : iterable of node IDs
        the candidate nodes to match against
    overlap_threshold : int or float
        minimum overlap in percent (default: 95)
    debug : bool
        if True, print the local span, the other interval and their
        overlap for every matching pair of spans
    max_len_diff : int
        maximum number of characters by which the overlapping part may
        be shorter than the longer span and still count as a match
        (default: 2). With 0, this criterion only accepts identical
        spans.

    Returns
    -------
    best_matches : dict
        maps each local node that has at least one match to the dict
        chosen by ``max_overlap``, containing ``node_id`` (the matching
        node from ``other_nodes``), ``overlap`` (in percent) and
        ``interval`` ((onset, offset) of the matching node). Local
        nodes without a match are absent.
    """
    local_span2nodes = _group_nodes_by_span(docgraph, local_nodes)
    other_span2nodes = _group_nodes_by_span(docgraph, other_nodes)

    other_tree = IntervalTree.from_tuples(other_span2nodes.keys())
    overlap_map = defaultdict(list)

    for local_span, nodes_with_local_span in local_span2nodes.items():
        local_on, local_off = local_span
        len_local = local_off - local_on

        # all the spans from ``other_nodes`` that overlap with this ``local_span``
        for overlap_interval in other_tree[Interval(*local_span)]:
            other_on, other_off = overlap_interval.begin, overlap_interval.end
            other_span = (other_on, other_off)
            len_longest_input = max(len_local, other_off - other_on)
            len_overlap = min(local_off, other_off) - max(local_on, other_on)

            # overlap in % between the input intervals
            overlap = len_overlap / float(len_longest_input) * 100

            is_match = (overlap >= overlap_threshold
                        or len_overlap + max_len_diff >= len_longest_input)
            if not is_match:
                continue

            # generate a mapping from a local node to all
            # the ``other_nodes`` it overlaps with (incl.
            # their overlap in % for finding the best match)
            for local_node in nodes_with_local_span:
                for other_node in other_span2nodes[other_span]:
                    overlap_map[local_node].append(
                        {'node_id': other_node, 'overlap': overlap,
                         'interval': other_span})

            if debug:
                # single-argument form: same output as the Python 2
                # print statement, and also valid on Python 3
                print("%s %s %s" % (local_span, overlap_interval, overlap))

    return {local_node: max_overlap(matches)
            for local_node, matches in overlap_map.items()}

In [142]:
# Match every subordinate clause constituent against the RST EDUs.
subord_nodes = get_subordinate_clauses(tdg)
edu_nodes = rs3.get_edus(tdg)

overlapping_nodes_map = find_overlapping_nodes(
    docgraph=tdg,
    local_nodes=subord_nodes,
    other_nodes=edu_nodes,
    overlap_threshold=100)

Task 1: Which syntactic subordinate clauses match with an RST EDU?


In [178]:
for subord_clause in overlapping_nodes_map:
#     print subord_clause
    print tdg.node[subord_clause]['tiger:cat'], dg.get_text(tdg, subord_clause)

#     other_node = overlapping_nodes_map[subord_clause]
#     other_node_id = other_node['node_id']
#     print other_node, dg.get_text(tdg, other_node_id)
#     print tdg.node[other_node_id]['rst:segment_type'], '\n'


VP ohne straffällig zu werden
S indem sie provozierten ohne straffällig zu werden
S Wer sich jetzt etwas zu Schulden kommen lässt
VP um mögliche Gewalttäter von ihrem Tun abzuhalten

Task 2: Are these EDUs satellites or nuclei?

Task 3: Does this correlate with certain RST relation types?


In [183]:
for edu in overlapping_nodes_map.values():
    edu_node_id = edu['node_id']
    segment_type = tdg.node[edu_node_id]['rst:segment_type']
    
    if segment_type == 'nucleus':
        rel_name = tdg.node[edu_node_id]['rst:rel_name']
    else:
        in_edges = tdg.in_edges(edu_node_id)
        assert len(in_edges) == 1, \
            "There must be exactly one dominating node."
        dom_node_id = in_edges[0][0]
        rel_name = tdg.node[dom_node_id]['rst:rel_name']
    print edu_node_id, segment_type, rel_name


rst:17 satellite circumstance
rst:16 nucleus circumstance
rst:6 satellite condition
rst:15 satellite purpose

In [ ]: