Generate token span ranges for RST segments and Conano (connector) int/ext units


In [2]:
import os
import itertools

import pandas as pd
from discoursegraphs.readwrite import RSTGraph, ConanoDocumentGraph
from discoursegraphs.readwrite.rst import get_rst_relation_root_nodes, get_rst_relations, get_rst_spans
from discoursegraphs.util import TokenMapper

# Corpus location in the current user's home directory ('~' is expanded),
# plus the two annotation subdirectories this notebook reads from.
PCC_ROOTDIR = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/')
RST_DIR = os.path.join(PCC_ROOTDIR, 'rst')  # *.rs3 files
CONANO_DIR = os.path.join(PCC_ROOTDIR, 'connectors')  # *.xml files

In [4]:
# Parse one sample document twice: once with the default settings and
# once with tokenize=False.
sample_rs3_path = os.path.join(RST_DIR, 'maz-00001.rs3')
rst_graph = RSTGraph(sample_rs3_path)
rst_graph_untokenized = RSTGraph(sample_rs3_path, tokenize=False)
# get_segment_spans_from_rst_relation(rst_graph, 'rst:40')

In [5]:
# Collect the corpus files with glob instead of the `!ls` shell magic:
# it needs no shell, and a directory without matching files yields an
# empty list rather than whatever text a failing `ls` would have printed.
import glob

# sorted() keeps the order deterministic; glob.glob() itself returns the
# matches in arbitrary order.
rst_files = sorted(glob.glob(os.path.join(RST_DIR, '*.rs3')))
conano_files = sorted(glob.glob(os.path.join(CONANO_DIR, '*.xml')))

In [6]:
def get_segment_token_offsets(segment_token_list, token_map):
    """
    Look up the token indices of a segment's first and last token.

    Parameters
    ----------
    segment_token_list : list of str
        sorted list of the token IDs that the segment spans
    token_map : dict of (str, int)
        a map from token IDs to token indices

    Returns
    -------
    first_token_index : int
        index of the first token of the segment
    last_token_index : int
        index of the last token of the segment
    """
    first_token_id = segment_token_list[0]
    last_token_id = segment_token_list[-1]
    return token_map[first_token_id], token_map[last_token_id]

In [7]:
def get_rst_spans(docgraph):
    """
    Yield the token span of every segment taking part in an RST relation.

    For each relation returned by ``get_rst_relations``, all 2-element
    combinations of its segments are formed and both members of every
    pair are yielded. A segment of a relation with more than two
    segments is therefore yielded once per pair it belongs to.

    NOTE(review): this definition has the same name as the
    ``get_rst_spans`` imported from ``discoursegraphs.readwrite.rst`` and
    replaces it once this cell has run. It also calls
    ``get_segment_spans_from_rst_relation``, which is neither defined nor
    imported in the visible part of this notebook -- confirm where it
    comes from.

    Parameters
    ----------
    docgraph : RSTGraph
        the document graph to extract the spans from

    Yields
    ------
    (segment_id, relation_name, first_token_index, last_token_index)
        ``segment_id`` is the dominating node ID and the segment ID
        joined by ':'
    """
    id2index = TokenMapper(docgraph).id2index

    for dom_node_id, rel_name, _token_ids in get_rst_relations(docgraph):
        # presumably a mapping from segment ID to that segment's token IDs
        segments = get_segment_spans_from_rst_relation(docgraph, dom_node_id)
        paired_seg_ids = itertools.chain.from_iterable(
            itertools.combinations(segments, 2))
        for seg_id in paired_seg_ids:
            first_idx, last_idx = get_segment_token_offsets(segments[seg_id], id2index)
            yield (':'.join((dom_node_id, seg_id)), rel_name, first_idx, last_idx)

In [8]:
from discoursegraphs.readwrite.conano import get_conano_units, get_connective

def get_conano_spans(docgraph):
    """
    Yield the token span and the connective of every Conano unit.

    Parameters
    ----------
    docgraph : ConanoDocumentGraph
        the document graph to extract the units from

    Yields
    ------
    (unit_id, connective, first_token_index, last_token_index)
        ``connective`` is whatever ``get_connective`` returns for the
        unit; the two indices come from the unit's first and last
        token ID
    """
    id2index = TokenMapper(docgraph).id2index

    for unit_id, unit_token_ids in get_conano_units(docgraph):
        span_start, span_end = get_segment_token_offsets(unit_token_ids, id2index)
        connective = get_connective(docgraph, unit_id)
        yield (unit_id, connective, span_start, span_end)

In [9]:
from discoursegraphs import select_nodes_by_layer

# Parse every RST file and derive its document ID (the graph name up to
# the first '.'). The CSV export of the RST and Conano spans below is
# currently commented out, so this loop only parses the files.
for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    doc_id = rst_graph.name.split('.')[0]
#     conano_graph = ConanoDocumentGraph(os.path.join(CONANO_DIR, doc_id+'.xml'))
    
#     rst_spans = get_rst_spans(rst_graph)
#     try:
#         rst_span_df = pd.DataFrame(rst_spans, columns=['segment-id', 'relation-name', 'start-token', 'end-token'])
#         rst_span_df.to_csv('/tmp/{}_rst_spans.csv'.format(doc_id), index=False)
#     except Exception as e:
#         print "{}: contains wrong RST spans".format(doc_id), e

#     try:
#         conano_spans = get_conano_spans(conano_graph)
#         conano_span_df = pd.DataFrame(conano_spans, columns=['unit-id', 'connective', 'start-token', 'end-token'])
#         conano_span_df.to_csv('/tmp/{}_conano_spans.csv'.format(doc_id), index=False, encoding="utf-8")
#     except Exception as e:
#         print "{}: contains wrong Conano spans".format(doc_id), e

Test RST assumptions

Are all segment dom_nodes from get_rst_relation_root_nodes() nuclei?


In [13]:
from collections import Counter

# Tally the 'rst:segment_type' of every relation root node that is in
# the 'rst:segment' layer (first two files only) and report the ones
# typed 'span'.
dom_segment_types = Counter()

for rst_file in rst_files[:2]:
    rst_graph = RSTGraph(rst_file)
    for dom_node, relname, toks in get_rst_relation_root_nodes(rst_graph):
        dom_node_attrs = rst_graph.node[dom_node]
        if 'rst:segment' not in dom_node_attrs['layers']:
            continue
        segment_type = dom_node_attrs['rst:segment_type']
        dom_segment_types[segment_type] += 1
        if segment_type == 'span':
            print("{} {}".format(os.path.basename(rst_file), dom_node))


maz-00002.rs3 rst:16
maz-00002.rs3 rst:18

In [ ]:
# Show the segment-type tally built by the previous cell.
dom_segment_types # nuc and span

No, lots of them are 'span' nodes

Are all group dom_nodes from get_rst_relation_root_nodes() nuclei?


In [14]:
# Tally the 'rst:segment_type' of every relation root node that is in
# the 'rst:group' layer, over all RST files.
dom_segment_types = Counter()

for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    for dom_node, relname, toks in get_rst_relation_root_nodes(rst_graph):
        dom_node_attrs = rst_graph.node[dom_node]
        if 'rst:group' in dom_node_attrs['layers']:
            dom_segment_types[dom_node_attrs['rst:segment_type']] += 1

In [15]:
# Show the segment-type tally of the group dom_nodes.
dom_segment_types


Out[15]:
Counter({'nucleus': 954, 'satellite': 230})

No.

are there dom_nodes that dominate only one node, which is a span?


In [ ]:
# This cell is the first to call select_neighbors_by_layer, so it is
# imported here; otherwise the cell raises a NameError when the notebook
# is run top to bottom on a fresh kernel.
from discoursegraphs import select_neighbors_by_layer

# Tally the 'rst:segment_type' of every segment/group node dominated by a
# group dom_node (first file only) and report each dominated 'span' node.
dominated_segment_types = Counter()

for rst_file in rst_files[:1]:
    rst_graph = RSTGraph(rst_file)
    for dom_node, relname, toks in get_rst_relations(rst_graph):
        if 'rst:group' not in rst_graph.node[dom_node]['layers']:
            continue
        dominated_nodes = select_neighbors_by_layer(
            rst_graph, dom_node,
            layer={'rst:segment', 'rst:group'},
            data=True)
        for neighbor, neighbor_attrs in dominated_nodes:
            segment_type = neighbor_attrs['rst:segment_type']
            dominated_segment_types[segment_type] += 1
            if segment_type == 'span':
                # single-argument print: same output under the print
                # statement and the print function
                print("{} {} {}".format(os.path.basename(rst_file), dom_node, relname))

In [ ]:
# Show the segment-type tally of the dominated nodes.
dominated_segment_types

Yes, unfortunately!

do all segment dom_nodes dominate exactly one group/segment?


In [ ]:
from collections import defaultdict
import os
from discoursegraphs import select_neighbors_by_layer

# Group the segment dom_nodes of all files by the number of
# segment/group nodes they dominate:
# {number of dominated nodes: [(file name, dom_node ID), ...]}
dominated_segment_count = defaultdict(list)

for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    rst_basename = os.path.basename(rst_file)
    for dom_node, relname, toks in get_rst_relations(rst_graph):
        if 'rst:segment' not in rst_graph.node[dom_node]['layers']:
            continue
        rst_neighbors = list(select_neighbors_by_layer(rst_graph, dom_node, layer={'rst:segment', 'rst:group'}))
        dominated_segment_count[len(rst_neighbors)].append((rst_basename, dom_node))

In [ ]:
# The distinct numbers of segment/group nodes dominated by a segment dom_node.
dominated_segment_count.keys()

Noooooooo!


In [ ]:
# Print how many segment dom_nodes dominate 0, 1, 2, ... nodes.
for dom_num, dom_nodes in dominated_segment_count.items():
    print("{} count:  {}".format(dom_num, len(dom_nodes)))

In [ ]:
# First recorded (file name, dom_node ID) pair of a segment dom_node
# that dominates two nodes.
dominated_segment_count[2][0]

In [ ]:
# Inspect a single node: its attributes, the segment/group nodes it
# dominates, and the layers of every node it has an edge to.
DOC_ID = 'maz-10205'
NODE_ID = 'rst:5'
rst_graph = RSTGraph(os.path.join(RST_DIR, DOC_ID+'.rs3'))

print("{} \n\n".format(rst_graph.node[NODE_ID]))
dominated_rst_nodes = list(select_neighbors_by_layer(rst_graph, NODE_ID, layer={'rst:segment', 'rst:group'}))
print(dominated_rst_nodes)

for dominated_node in rst_graph[NODE_ID]:
    dominated_layers = rst_graph.node[dominated_node]['layers']
    print("{} {}".format(dominated_node, dominated_layers))

does each RST dominating node have max. 1 outgoing spanning relation?


In [ ]:
from discoursegraphs import EdgeTypes

# For every relation's dominating node, count the outgoing edges to
# segment/group neighbors whose 'edge_type' is a spanning relation, then
# tally how many dominating nodes have 0, 1, 2, ... such edges.
dominated_span_count = Counter()

for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    # NOTE(review): this is the only call in the notebook that passes
    # data=True to get_rst_relations -- confirm it still yields 3-tuples.
    for dom_node, relname, toks in get_rst_relations(rst_graph, data=True):
#         print dom_node, relname
        span_neighbors = []
        for neighbor in select_neighbors_by_layer(rst_graph, dom_node, layer={'rst:segment', 'rst:group'}):
            # a neighbor is appended once per matching parallel edge
            for edge in rst_graph.edge[dom_node][neighbor]:  # this is a multi-digraph
                if rst_graph.edge[dom_node][neighbor][edge]['edge_type'] == EdgeTypes.spanning_relation:
                    span_neighbors.append(neighbor)
        dominated_span_count[len(span_neighbors)] += 1

In [ ]:
# {number of outgoing spanning-relation edges: number of dominating nodes}
dominated_span_count

RST segments/groups with empty segment_type or segment_type 'span'?


In [ ]:
from collections import Counter

# Tally the 'rst:segment_type' of every node in the 'rst:segment' or
# 'rst:group' layer, over all RST files.
segtype_counter = Counter()

for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    rst_nodes = select_nodes_by_layer(rst_graph, layer={'rst:segment', 'rst:group'}, data=True)
    for node_id, node_attrs in rst_nodes:
        segment_type = node_attrs['rst:segment_type']
        segtype_counter[segment_type] += 1
#             print os.path.basename(rst_file), node_id, node_attrs
#             print os.path.basename(rst_file), node_id, node_attrs

In [ ]:
# Show the segment-type tally over all segment and group nodes.
segtype_counter

NO, it's fine now.

how are multinuc relations signalled (edge attribs, dom node attribs, dominated node attrs)?


In [ ]:
# Load one document and show the attributes of node 'rst:20', which the
# following cells use as the example for the question above.
DOC_ID = 'maz-00002'

rst_graph = RSTGraph(os.path.join(RST_DIR, DOC_ID+'.rs3'))

rst_graph.node['rst:20']

In [ ]:
# Show the outgoing edges of node 'rst:20' together with their attributes.
rst_graph['rst:20']

In [ ]:
# Print the attributes of every node that 'rst:20' has an edge to.
for neighbor in rst_graph.neighbors('rst:20'):
    neighbor_attrs = rst_graph.node[neighbor]
    print("{} {}".format(neighbor, neighbor_attrs))

Can multinucs additionally dominate spans?


In [ ]:
# Print every multinuc group node that has an outgoing edge whose
# 'rst:rel_name' is 'span'.
for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    for dom_node, relname, toks in get_rst_relations(rst_graph):
        dom_node_attrs = rst_graph.node[dom_node]
        if 'rst:group' not in dom_node_attrs['layers']:
            continue
        if dom_node_attrs['rst:group_type'] != 'multinuc':
            continue
        for target in rst_graph[dom_node]:
            parallel_edges = rst_graph[dom_node][target]
            for edge in parallel_edges:
                if parallel_edges[edge]['rst:rel_name'] == 'span':
                    print(dom_node)

No.

Are there RST relations with non-adjacent segments?


In [4]:
import sys
from collections import defaultdict

def are_rst_spans_continuous(rst_spans):
    """
    Check that the segments of every RST relation cover a gap-free range
    of token indices.

    Parameters
    ----------
    rst_spans : iterable of (rel_id, seg_id, relname, start, end)
        one tuple per segment; ``start`` and ``end`` are the inclusive
        token indices of the segment, ``rel_id`` identifies the relation
        the segment belongs to

    Returns
    -------
    bool
        False as soon as one relation has a token index between its
        smallest ``start`` and its largest ``end`` that none of its
        segments covers; True otherwise (also for empty input)
    """
    spans_by_relation = defaultdict(list)
    for rel_id, _seg_id, _relname, start, end in rst_spans:
        spans_by_relation[rel_id].append((start, end))

    for segment_spans in spans_by_relation.values():
        covered = set()
        for start, end in segment_spans:
            covered.update(range(start, end + 1))

        # min()/max() replace the former sys.maxint / 0 sentinels
        # (sys.maxint no longer exists in Python 3). Every list holds at
        # least one span, and for non-negative token indices the result
        # is the same as before.
        rel_start = min(span_start for span_start, _ in segment_spans)
        rel_end = max(span_end for _, span_end in segment_spans)

        if not covered.issuperset(range(rel_start, rel_end + 1)):
            return False
    return True

In [5]:
# List the RST files with glob instead of the `!ls` shell magic: it needs
# no shell, and a directory without matching files yields an empty list
# rather than whatever text a failing `ls` would have printed.
import glob

PCC_ROOTDIR = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/')
RST_DIR = os.path.join(PCC_ROOTDIR, 'rst')
# sorted() keeps the report order deterministic
rst_files = sorted(glob.glob(os.path.join(RST_DIR, '*.rs3')))

# Print the name of every file with at least one RST relation whose
# segments do not cover a gap-free token range.
# NOTE(review): are_rst_spans_continuous unpacks 5-tuples, while the
# get_rst_spans defined earlier in this notebook yields 4-tuples; this
# cell presumably relies on the get_rst_spans imported from
# discoursegraphs.readwrite.rst -- confirm which one is in scope.
for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    rst_spans = get_rst_spans(rst_graph)
    if not are_rst_spans_continuous(rst_spans):
        print(os.path.basename(rst_file))


maz-10207.rs3
maz-11299.rs3
maz-11766.rs3
maz-12188.rs3
maz-13915.rs3
maz-14654.rs3
maz-15609.rs3
maz-18750.rs3
maz-19436.rs3
maz-3367.rs3
maz-4031.rs3
maz-4403.rs3
maz-4472.rs3
maz-5012.rs3
maz-6165.rs3
maz-6728.rs3
maz-6918.rs3
maz-8288.rs3
maz-8563.rs3
maz-8838.rs3
maz-9335.rs3
maz-9891.rs3

Yes, there are RST relations with non-adjacent segments

See the interpretation relations at the beginning of maz-10207.rs3. This is legal according to the rs3 file, which lists interpretation as an rst relation, not a multinuc.

Are there any overlapping segments within RST relations?


In [ ]: