Task: match syntax spans with RST spans

Subtask 1: get all syntax spans (with string/token offsets)


In [1]:
import os
import discoursegraphs as dg

In [2]:
ddg = dg.corpora.pcc.get_document('maz-6728')

In [3]:
dg.DATA_ROOT_DIR


Out[3]:
'/home/arne/repos/discoursegraphs/src/discoursegraphs/data'

In [4]:
# dg.corpora.pcc.get_files_by_layer('syntax')
tdg = dg.read_tiger(os.path.join(
        dg.DATA_ROOT_DIR,
        'potsdam-commentary-corpus-2.0.0/syntax/maz-11766.xml'))

In [5]:
# dg.info(tdg)

In [6]:
# %load_ext gvmagic

In [7]:
# %dotstr dg.print_dot(tdg)

In [8]:
syntax_nodes = set(dg.select_nodes_by_layer(ddg, 'tiger:syntax'))

In [9]:
cat_nodes = set(dg.select_nodes_by_attribute(ddg, 'tiger:cat'))

In [10]:
# token_map = dg.util.TokenMapper(ddg)

In [11]:
# token_map.id2index['rst:rst:9_0']

In [25]:
token_map = dg.util.TokenMapper(ddg)

cat_spans = {}
for cat_node in cat_nodes:
#     print ddg.node[cat_node]['tiger:cat'], dg.get_span_offsets(ddg, cat_node)
    first_token_index = token_map.id2index[dg.get_span(ddg, cat_node)[0]]
    last_token_index = token_map.id2index[dg.get_span(ddg, cat_node)[-1]]
    
#     print ddg.node[cat_node]['tiger:cat'], token_map.id2index[first_token_id], token_map.id2index[last_token_id]
    cat_spans[(first_token_index, last_token_index)] = {
        'node': cat_node, 'cat': ddg.node[cat_node]['tiger:cat']}

In [26]:
def get_syntax_spans(docgraph):
    token_map = dg.util.TokenMapper(docgraph)
    cat_nodes = set(dg.select_nodes_by_attribute(docgraph, 'tiger:cat'))

    syntax_spans = {}
    for cat_node in cat_nodes:
        first_token_index = token_map.id2index[dg.get_span(docgraph, cat_node)[0]]
        last_token_index = token_map.id2index[dg.get_span(docgraph, cat_node)[-1]]

        syntax_spans[(first_token_index, last_token_index)] = {
            'node': cat_node, 'cat': docgraph.node[cat_node]['tiger:cat']}

    return syntax_spans

In [19]:
# cat_spans

In [34]:
def get_rst_span_map(docgraph):
    rst_spans = {}
    for rel_id, span_type, rel_type, tok_onset, tok_offset in dg.readwrite.rst.rs3.get_rst_spans(docgraph):
        rst_spans[(tok_onset, tok_offset)] = {
            'rel_id': rel_id, 'span_type': span_type, 'rel_type': rel_type}
    
    return rst_spans

There are only very few RST spans that match to non-S/CS nodes


In [35]:
for doc_id in dg.corpora.pcc.document_ids:
    ddg = dg.corpora.pcc.get_document(doc_id)
    syntax_spans = get_syntax_spans(ddg)
    
    for syntax_span in syntax_spans:
        if syntax_spans[syntax_span]['cat'] not in ('S', 'CS'):
            if syntax_span in rst_spans:
                print doc_id, syntax_span, syntax_spans[syntax_span]['cat']


maz-11507 (103, 111) VP
maz-12473 (31, 41) PP
maz-13915 (148, 161) VP
maz-16360 (31, 41) NP
maz-16590 (82, 89) PP
maz-18377 (124, 147) CVP
maz-19436 (162, 167) NP
maz-3073 (72, 89) VP
maz-5709 (162, 167) PP

In [42]:
from collections import Counter

cat_counter = Counter()
subord_counter = Counter()

for doc_id in dg.corpora.pcc.document_ids:
    ddg = dg.corpora.pcc.get_document(doc_id)
    syntax_spans = get_syntax_spans(ddg)
    
    subord_nodes = get_subordinate_clauses(ddg)
    
    for syntax_span in syntax_spans:
        if syntax_span in rst_spans:
#             print doc_id, syntax_span, syntax_spans[syntax_span]['cat']
            cat = syntax_spans[syntax_span]['cat']
            cat_counter[cat] += 1
        
            if syntax_spans[syntax_span]['node'] in subord_nodes:
                subord_counter[cat] += 1

In [44]:
cat_counter.most_common()


Out[44]:
[('S', 17), ('PP', 3), ('VP', 3), ('CS', 2), ('NP', 2), ('CVP', 1)]

In [45]:
subord_counter


Out[45]:
Counter({'PP': 2, 'S': 1})

Are there any 'S'/'CS' that are not sentence root nodes?

  • subordinate clauses etc.

In [36]:
def get_subordinate_clauses(tiger_docgraph):
    """
    given a document graph of a TIGER syntax tree, return all
    node IDs of nodes representing subordinate clause constituents.

    Parameters
    ----------
    tiger_docgraph : dg.DiscourseDocumentGraph
        document graph from which subordinate clauses will be extracted
    
    Returns
    -------
    subord_clause_nodes : list(str)
        list of node IDs of nodes directly dominating subordinate clauses
    """
    subord_clause_rels = \
        dg.select_edges_by_attribute(
            tiger_docgraph, attribute='tiger:label',
            value=['MO', 'RC', 'SB'])
    
    subord_clause_nodes = []
    for src_id, target_id in subord_clause_rels:
        src_cat = tiger_docgraph.node[src_id].get('tiger:cat')
        if src_cat == 'S' and not dg.istoken(tiger_docgraph, target_id):
            subord_clause_nodes.append(target_id)
    return subord_clause_nodes

In [39]:
for node_id in get_subordinate_clauses(ddg):
    print node_id, ddg.node[node_id]


s2158_509 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'S', 'tiger:id': 's2158_509', 'label': 'S'}
s2158_501 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'AP', 'tiger:id': 's2158_501', 'label': 'AP'}
s2159_507 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'PP', 'tiger:id': 's2159_507', 'label': 'PP'}
s2162_501 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'PP', 'tiger:id': 's2162_501', 'label': 'PP'}
s2152_500 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2152_500', 'label': 'NP'}
s2153_500 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2153_500', 'label': 'NP'}
s2154_502 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2154_502', 'label': 'NP'}
s2156_502 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2156_502', 'label': 'NP'}
s2158_500 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2158_500', 'label': 'NP'}
s2159_503 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2159_503', 'label': 'NP'}
s2159_500 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2159_500', 'label': 'NP'}
s2160_502 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'S', 'tiger:id': 's2160_502', 'label': 'S'}
s2160_500 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2160_500', 'label': 'NP'}
s2161_500 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2161_500', 'label': 'NP'}
s2162_504 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2162_504', 'label': 'NP'}
s2162_502 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'CNP', 'tiger:id': 's2162_502', 'label': 'CNP'}
s2163_500 {'layers': set(['tiger:syntax', 'tiger']), 'tiger:cat': 'NP', 'tiger:id': 's2163_500', 'label': 'NP'}

In [ ]:


In [18]:
# rst_spans

In [24]:
for cat_span in cat_spans:
    if cat_span in rst_spans:
        print cat_span, cat_spans[cat_span][1], rst_spans[cat_span][1:]


(51, 71) S ['N', 'reason']
(103, 123) S ['S', 'elaboration']
(72, 89) S ['S', 'reason']
(162, 178) CS ['S', 'summary']
(10, 50) S ['S', 'cause']
(1, 9) S ['N', 'cause']
(148, 161) S ['S', 'reason']
(90, 102) S ['N', 'elaboration']
(124, 147) S ['N', 'reason']

In [ ]: