In [2]:
import glob
import itertools
import os

import pandas as pd

from discoursegraphs.readwrite import RSTGraph, ConanoDocumentGraph
from discoursegraphs.readwrite.rst import get_rst_relation_root_nodes, get_rst_relations, get_rst_spans
from discoursegraphs.util import TokenMapper
# Root directory of the corpus checkout (expected below the user's home directory).
PCC_ROOTDIR = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/')
# Sub-directory whose *.rs3 files are loaded as RSTGraph instances in this notebook.
RST_DIR = os.path.join(PCC_ROOTDIR, 'rst')
# Sub-directory whose *.xml files are listed as Conano (connective) annotations.
CONANO_DIR = os.path.join(PCC_ROOTDIR, 'connectors')
In [4]:
# Load one sample document twice: once with the default settings and once
# with tokenize=False, so that both variants can be inspected side by side.
sample_rs3_path = os.path.join(RST_DIR, 'maz-00001.rs3')
rst_graph = RSTGraph(sample_rs3_path)
rst_graph_untokenized = RSTGraph(sample_rs3_path, tokenize=False)
# get_segment_spans_from_rst_relation(rst_graph, 'rst:40')
In [5]:
# Collect the annotation files of both layers as sorted lists of paths.
# glob is used instead of the IPython-only `!ls` capture: it is plain Python,
# does not word-split paths containing spaces, and yields an empty list
# (rather than a captured shell error message) when nothing matches.
rst_files = sorted(glob.glob(os.path.join(RST_DIR, '*.rs3')))
conano_files = sorted(glob.glob(os.path.join(CONANO_DIR, '*.xml')))
In [6]:
def get_segment_token_offsets(segment_token_list, token_map):
    """
    Look up the token indices of the first and last token of a segment.

    Parameters
    ----------
    segment_token_list : list of str
        sorted list of token IDs (i.e. the tokens that this segment spans);
        must not be empty
    token_map : dict of (str, int)
        a map from token IDs to token indices

    Returns
    -------
    first_token_index : int
        index of the first token of the segment
    last_token_index : int
        index of the last token of the segment

    Raises
    ------
    IndexError
        if ``segment_token_list`` is empty
    KeyError
        if the first or last token ID is missing from ``token_map``
    """
    first_token_id = segment_token_list[0]
    last_token_id = segment_token_list[-1]
    return token_map[first_token_id], token_map[last_token_id]
In [7]:
def get_rst_spans(docgraph):
    """
    Yield the token offsets of the segments that take part in each RST
    relation of a document graph.

    NOTE(review): this definition replaces the ``get_rst_spans`` imported
    from ``discoursegraphs.readwrite.rst`` for the rest of the session.
    NOTE(review): ``get_segment_spans_from_rst_relation`` is neither imported
    nor defined in the visible part of this notebook -- confirm its origin.
    NOTE(review): ``are_rst_spans_continuous`` unpacks 5-tuples, while this
    generator yields 4-tuples -- the two cannot be combined as they stand.

    Parameters
    ----------
    docgraph : document graph
        graph accepted by ``TokenMapper`` and ``get_rst_relations``
        (presumably an ``RSTGraph`` -- confirm)

    Yields
    ------
    span : tuple of (str, str, int, int)
        ``'<dominating node ID>:<segment ID>'``, the relation name, the
        index of the segment's first token and the index of its last token
    """
    token_map = TokenMapper(docgraph).id2index
    for dom_node_id, rel_name, _token_ids in get_rst_relations(docgraph):
        segments = get_segment_spans_from_rst_relation(docgraph, dom_node_id)
        # Walk through all 2-combinations of segment IDs, pair by pair. A
        # segment is therefore emitted once for every pair it belongs to
        # (and not at all if the relation has fewer than two segments).
        paired_segment_ids = itertools.chain.from_iterable(
            itertools.combinations(segments, 2))
        for seg_id in paired_segment_ids:
            first_tok, last_tok = get_segment_token_offsets(
                segments[seg_id], token_map)
            yield (dom_node_id + ':' + seg_id, rel_name, first_tok, last_tok)
In [8]:
from discoursegraphs.readwrite.conano import get_conano_units, get_connective
def get_conano_spans(docgraph):
    """
    Yield one record per Conano unit together with its token offsets.

    Parameters
    ----------
    docgraph : document graph
        graph accepted by ``TokenMapper``, ``get_conano_units`` and
        ``get_connective`` (presumably a ``ConanoDocumentGraph`` -- confirm)

    Yields
    ------
    span : tuple
        ``(unit ID, connective, first token index, last token index)``,
        where the connective is whatever ``get_connective`` returns for
        the unit
    """
    id2index = TokenMapper(docgraph).id2index
    for unit_id, unit_token_ids in get_conano_units(docgraph):
        start, end = get_segment_token_offsets(unit_token_ids, id2index)
        connective = get_connective(docgraph, unit_id)
        yield (unit_id, connective, start, end)
In [9]:
from discoursegraphs import select_nodes_by_layer
# Walk over every RST file and derive its document ID (the graph name up to
# the first dot). The CSV export of the RST / Conano spans is disabled below.
for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    doc_id = rst_graph.name.partition('.')[0]
    # conano_graph = ConanoDocumentGraph(os.path.join(CONANO_DIR, doc_id+'.xml'))
    # rst_spans = get_rst_spans(rst_graph)
    # try:
    #     rst_span_df = pd.DataFrame(rst_spans, columns=['segment-id', 'relation-name', 'start-token', 'end-token'])
    #     rst_span_df.to_csv('/tmp/{}_rst_spans.csv'.format(doc_id), index=False)
    # except Exception as e:
    #     print "{}: contains wrong RST spans".format(doc_id), e
    # try:
    #     conano_spans = get_conano_spans(conano_graph)
    #     conano_span_df = pd.DataFrame(conano_spans, columns=['unit-id', 'connective', 'start-token', 'end-token'])
    #     conano_span_df.to_csv('/tmp/{}_conano_spans.csv'.format(doc_id), index=False, encoding="utf-8")
    # except Exception as e:
    #     print "{}: contains wrong Conano spans".format(doc_id), e
In [13]:
from collections import Counter
dom_segment_types = Counter()
for rst_file in rst_files[:2]:
rst_graph = RSTGraph(rst_file)
for dom_node, relname, toks in get_rst_relation_root_nodes(rst_graph):
if 'rst:segment' in rst_graph.node[dom_node]['layers']:
dom_segment_types[rst_graph.node[dom_node]['rst:segment_type']] += 1
if rst_graph.node[dom_node]['rst:segment_type'] == 'span':
print os.path.basename(rst_file), dom_node
In [ ]:
dom_segment_types # nuc and span
In [14]:
# Same tally as for the segment roots, but over the whole corpus and for
# relation root nodes that belong to the 'rst:group' layer.
dom_segment_types = Counter()
for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    for dom_node, relname, toks in get_rst_relation_root_nodes(rst_graph):
        node_attrs = rst_graph.node[dom_node]
        if 'rst:group' not in node_attrs['layers']:
            continue
        dom_segment_types[node_attrs['rst:segment_type']] += 1
In [15]:
dom_segment_types
Out[15]:
In [ ]:
dominated_segment_types = Counter()
for rst_file in rst_files[:1]:
rst_graph = RSTGraph(rst_file)
for dom_node, relname, toks in get_rst_relations(rst_graph):
if 'rst:group' in rst_graph.node[dom_node]['layers']:
for neighbor, neighbor_attrs in select_neighbors_by_layer(rst_graph,
dom_node,
layer={'rst:segment', 'rst:group'},
data=True):
dominated_segment_types[neighbor_attrs['rst:segment_type']] += 1
if neighbor_attrs['rst:segment_type'] == 'span':
print os.path.basename(rst_file), dom_node, relname
In [ ]:
dominated_segment_types
In [ ]:
from collections import defaultdict
import os
from discoursegraphs import select_neighbors_by_layer
# Group all relations whose dominating node is a segment by the number of
# segment/group neighbours of that node; each entry records where the
# relation was found as (file name, dominating node ID).
dominated_segment_count = defaultdict(list)
for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    file_name = os.path.basename(rst_file)
    for dom_node, relname, toks in get_rst_relations(rst_graph):
        if 'rst:segment' not in rst_graph.node[dom_node]['layers']:
            continue
        rst_neighbors = list(select_neighbors_by_layer(
            rst_graph, dom_node, layer={'rst:segment', 'rst:group'}))
        dominated_segment_count[len(rst_neighbors)].append((file_name, dom_node))
In [ ]:
dominated_segment_count.keys()
In [ ]:
for dom_num in dominated_segment_count:
print dom_num, "count: ", len(dominated_segment_count[dom_num])
In [ ]:
dominated_segment_count[2][0]
In [ ]:
DOC_ID = 'maz-10205'
NODE_ID = 'rst:5'
rst_graph = RSTGraph(os.path.join(RST_DIR, DOC_ID+'.rs3'))
print rst_graph.node[NODE_ID], '\n\n'
print list(select_neighbors_by_layer(rst_graph, NODE_ID, layer={'rst:segment', 'rst:group'}))
for dominated_node in rst_graph[NODE_ID]:
print dominated_node, rst_graph.node[dominated_node]['layers']
In [ ]:
from discoursegraphs import EdgeTypes
# For every relation in the corpus, count the edges of type
# EdgeTypes.spanning_relation that lead from the dominating node to one of
# its segment/group neighbours, and tally how often each count occurs.
dominated_span_count = Counter()
for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    for dom_node, relname, toks in get_rst_relations(rst_graph, data=True):
        # The graph is a multi-digraph, so two nodes can be connected by
        # several edges; a neighbour is listed once per spanning edge.
        span_neighbors = [
            neighbor
            for neighbor in select_neighbors_by_layer(
                rst_graph, dom_node, layer={'rst:segment', 'rst:group'})
            for edge_attrs in rst_graph.edge[dom_node][neighbor].values()
            if edge_attrs['edge_type'] == EdgeTypes.spanning_relation
        ]
        dominated_span_count[len(span_neighbors)] += 1
In [ ]:
dominated_span_count
In [ ]:
from collections import Counter
# Corpus-wide distribution of 'rst:segment_type' over all nodes selected
# from the 'rst:segment' / 'rst:group' layers.
segtype_counter = Counter()
for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    rst_nodes = select_nodes_by_layer(
        rst_graph, layer={'rst:segment', 'rst:group'}, data=True)
    segtype_counter.update(
        node_attrs['rst:segment_type'] for node_id, node_attrs in rst_nodes)
In [ ]:
segtype_counter
In [ ]:
# Load a second sample document and display the attributes of node 'rst:20'.
DOC_ID = 'maz-00002'
rst_graph = RSTGraph(os.path.join(RST_DIR, '{}.rs3'.format(DOC_ID)))
rst_graph.node['rst:20']
In [ ]:
rst_graph['rst:20']
In [ ]:
for neighbor in rst_graph.neighbors('rst:20'):
print neighbor, rst_graph.node[neighbor]
In [ ]:
# Look for multinuc group nodes that have an outgoing edge whose relation
# name is 'span'; the node ID is printed once per such edge.
for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    for dom_node, relname, toks in get_rst_relations(rst_graph):
        dom_attrs = rst_graph.node[dom_node]
        is_multinuc_group = ('rst:group' in dom_attrs['layers']
                             and dom_attrs['rst:group_type'] == 'multinuc')
        if not is_multinuc_group:
            continue
        for parallel_edges in rst_graph[dom_node].values():
            for edge_attrs in parallel_edges.values():
                if edge_attrs['rst:rel_name'] == 'span':
                    print(dom_node)
In [4]:
import sys
from collections import defaultdict
def are_rst_spans_continuous(rst_spans):
    """
    Check that the spans of every RST relation cover an unbroken range of
    token indices.

    Parameters
    ----------
    rst_spans : iterable of (rel_id, seg_id, relname, start, end)
        one 5-tuple per span. ``rel_id`` groups the spans into relations;
        ``start`` and ``end`` are the inclusive token indices of the span.
        ``seg_id`` and ``relname`` are not used by this check. The iterable
        is consumed once, so a generator is fine.

    Returns
    -------
    bool
        False if some relation has a token index between its smallest
        start and its largest end that is not covered by any of its spans;
        True otherwise (also for empty input).
    """
    spans_by_relation = defaultdict(list)
    for rel_id, _seg_id, _relname, start, end in rst_spans:
        spans_by_relation[rel_id].append((start, end))

    for offsets in spans_by_relation.values():
        covered = set()
        for start, end in offsets:
            covered.update(range(start, end + 1))
        # Derive the bounds from the data itself instead of seeding them
        # with sys.maxint / 0 (sys.maxint does not exist on Python 3).
        rel_start = min(start for start, _ in offsets)
        rel_end = max(end for _, end in offsets)
        if not covered.issuperset(range(rel_start, rel_end + 1)):
            return False
    return True
In [5]:
# Corpus-wide sanity check: print the name of every RST file in which the
# spans of some relation do not cover a continuous token range.
#
# The notebook also defines its own ``get_rst_spans``, which yields 4-tuples,
# whereas ``are_rst_spans_continuous`` unpacks 5-tuples. The library function
# is therefore bound under an explicit alias so this cell does not depend on
# which definition was executed last.
# NOTE(review): assumes the library's get_rst_spans yields
# (relation ID, segment ID, relation name, start, end) 5-tuples -- confirm
# against the installed discoursegraphs version.
from discoursegraphs.readwrite.rst import get_rst_spans as get_library_rst_spans

# Repeats the configuration from the top of the notebook.
PCC_ROOTDIR = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/')
RST_DIR = os.path.join(PCC_ROOTDIR, 'rst')
# Plain-Python replacement for the IPython-only `!ls $RST_DIR/*.rs3` capture.
rst_files = sorted(glob.glob(os.path.join(RST_DIR, '*.rs3')))

for rst_file in rst_files:
    rst_graph = RSTGraph(rst_file)
    rst_spans = get_library_rst_spans(rst_graph)
    if not are_rst_spans_continuous(rst_spans):
        print(os.path.basename(rst_file))
In [ ]: