In [1]:
%matplotlib inline
# %install_ext https://raw.github.com/cjdrake/ipython-magic/master/gvmagic.py
%load_ext gvmagic
import os
from collections import Counter
from operator import itemgetter
import pandas as pd
from discoursegraphs import (info, print_dot, get_span, select_nodes_by_layer,
select_neighbors_by_layer, istoken, tokens2text)
from discoursegraphs.readwrite import ConanoDocumentGraph, RSTGraph
from discoursegraphs.readwrite.rst import get_rst_relations, get_segment_spans_from_rst_relation
from discoursegraphs.readwrite.conano import get_conano_units, get_connective
from discoursegraphs.util import TokenMapper, natural_sort_key
In [2]:
DOC_ID = 'maz-2316'
MAZ_ROOTDIR = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/')
CONANO_TESTFILE = os.path.join(MAZ_ROOTDIR, 'connectors/{}.xml'.format(DOC_ID))
RST_TESTFILE = os.path.join(MAZ_ROOTDIR, 'rst/{}.rs3'.format(DOC_ID))
In [3]:
def segment2text(docgraph, segment_node):
    """Return the text of a segment: its tokens joined by single spaces."""
    token_strings = [docgraph.get_token(tok_id)
                     for tok_id in get_span(docgraph, segment_node)]
    return ' '.join(token_strings)
In [4]:
# parse the Conano (connectives) and RST annotation files into document graphs
cdg = ConanoDocumentGraph(CONANO_TESTFILE)
rdg = RSTGraph(RST_TESTFILE)
In [5]:
# merge the Conano annotation graph into the RST graph (modifies rdg in place)
rdg.merge_graphs(cdg)
merged_graph = rdg # you don't need to do this; new variable is only introduced for convenience
In [6]:
# parse the RST file again, this time without tokenizing the segments
rdg_unotokenized = RSTGraph(RST_TESTFILE, tokenize=False)
#info(rdg_unotokenized)
In [7]:
#%dotstr print_dot(rdg_unotokenized)
In [8]:
# a standalone (un-merged) Conano graph, kept around for inspection
connectives_only = ConanoDocumentGraph(CONANO_TESTFILE)
#info(connectives_only)
#info(merged_graph)
In [9]:
#%dotstr print_dot(merged_graph)
In [10]:
# map each token ID in the merged document to its linear token index
token_mapper = TokenMapper(merged_graph)

# one entry per Conano unit:
# (unit ID, its connective, index of its first token, index of its last token)
unit_ranges = [(unit_id,
                get_connective(merged_graph, unit_id),
                token_mapper.id2index[token_ids[0]],
                token_mapper.id2index[token_ids[-1]])
               for unit_id, token_ids in get_conano_units(merged_graph)]
In [11]:
pd.DataFrame(unit_ranges, columns=['unit-id', 'connective', 'start-token', 'end-token'])
Out[11]:
In [12]:
# one entry per RST relation:
# (dominating node ID, relation name, first token index, last token index)
rst_ranges = [(dom_node_id, rel_name,
               token_mapper.id2index[token_ids[0]],
               token_mapper.id2index[token_ids[-1]])
              for dom_node_id, rel_name, token_ids in get_rst_relations(merged_graph)]
In [13]:
pd.DataFrame(rst_ranges, columns=['relation-id', 'relation-name', 'start-token', 'end-token'])
Out[13]:
In [14]:
# For each RST relation: print the dominating node's ID, layers, segment type and
# (if present) group type, then every neighbouring RST segment/group with its text.
for dom_node_id in get_rst_relations(merged_graph, data=False):
    print dom_node_id, merged_graph.node[dom_node_id]['layers'], merged_graph.node[dom_node_id]['rst:segment_type'], merged_graph.node[dom_node_id].get('rst:group_type')
    for neighbor in merged_graph.neighbors(dom_node_id):
        layers = merged_graph.node[neighbor]['layers']
        # only show neighbours that are RST segments or groups (skip e.g. tokens)
        if 'rst:segment' in layers or 'rst:group' in layers:
            print u"\t{0} {1}: {2}".format(neighbor,
                                           merged_graph.node[neighbor]['rst:segment_type'],
                                           segment2text(merged_graph, neighbor))
    # blank separator line between relations
    print '\n'
In [15]:
# For RST relations whose dominating node is itself a segment, print the node ID
# and the token nodes it directly neighbours, in natural sort order.
for dom_node_id in get_rst_relations(merged_graph, data=False):
    if 'rst:segment' in merged_graph.node[dom_node_id]['layers']:
        print dom_node_id, sorted([n for n in merged_graph.neighbors(dom_node_id)
                                   if istoken(merged_graph, n)], key=natural_sort_key)
In [17]:
# Print each RST relation ID together with the text of every segment span
# that takes part in the relation.
for rel_id in get_rst_relations(merged_graph, data=False):
    print rel_id
    segment_spans = get_segment_spans_from_rst_relation(merged_graph, rel_id)
    for span, span_tokens in segment_spans.iteritems():
        print "\t", span, tokens2text(merged_graph, span_tokens)
    # blank separator line between relations
    print
In [40]:
def get_segment_token_offsets(segment_token_list, token_map):
    """
    Return the token indices of the first and last token of a segment.

    Parameters
    ----------
    segment_token_list : list of str
        sorted list of token IDs (i.e. the tokens
        that this segment spans); must be non-empty
    token_map : dict of (str, int)
        a map from token IDs to token indices

    Returns
    -------
    first_token_index : int
        index of the first token of the segment
    last_token_index : int
        index of the last token of the segment
    """
    # [0] / [-1] rely on segment_token_list being sorted by token position
    return token_map[segment_token_list[0]], token_map[segment_token_list[-1]]
In [55]:
import itertools  # was used below but never imported -> NameError on fresh kernel

# Map token IDs to linear token indices once, then record the token offset range
# of every segment that participates in a pairwise combination within an RST relation.
token_map = TokenMapper(merged_graph).id2index
rst_segment_ranges = []
for dom_node_id, rel_name, token_ids in get_rst_relations(merged_graph):
    segments = get_segment_spans_from_rst_relation(merged_graph, dom_node_id)
    # consider every unordered pair of segments belonging to the same relation
    for segment_pair in itertools.combinations(segments, 2):
        for seg_id in segment_pair:
            start, end = get_segment_token_offsets(segments[seg_id], token_map)
            # NOTE(review): a segment occurring in several pairs is appended once
            # per pair, so duplicates are possible — confirm this is intended
            rst_segment_ranges.append( (dom_node_id+':'+seg_id, rel_name, start, end) )
In [80]:
import networkx as nx
for node_id in nx.topological_sort(merged_graph):
if node_id in list(select_nodes_by_layer(merged_graph, layer={'rst:segment', 'rst:group'})):
if merged_graph.node[node_id].get('rst:segment_type') != 'isolated':
print node_id,
In [77]:
merged_graph.node['rst:1']
Out[77]:
In [81]:
# tabular view of the RST segment spans collected above
rst_segments_df = pd.DataFrame(rst_segment_ranges, columns=['segment-id', 'rst-relation', 'start-token', 'end-token'])
rst_segments_df
Out[81]:
In [82]:
rst_segments_df.save?
In [ ]: