In [1]:
import os
from operator import itemgetter
import pygraphviz
import networkx as nx
from discoursegraphs.readwrite import TigerDocumentGraph
from discoursegraphs.util import natural_sort_key
# ID of the document to visualise; it is combined with a file extension
# ('.xml', '.rs3') to build the input file names further down.
# Alternative document:
# DOC_ID = 'maz-19295'
DOC_ID = 'maz-00002'
# Corpus directories. The leading '~' is expanded with
# os.path.expanduser() where the file paths are built.
TIGER_DIR = '~/repos/pcc-annis-merged/maz176/syntax/'
RST_DIR = '~/repos/pcc-annis-merged/maz176/rst/'
# NOTE(review): MMAX_DIR is not referenced in any of the cells shown here.
MMAX_DIR = '~/repos/pcc-annis-merged/maz176/coreference/'
In [2]:
def add_precedence_relations(dot_graph, token_ids, **attr):
    """
    Link the given tokens into a precedence chain.

    The token IDs are put into natural sort order and every token is
    connected to its successor (first -> second, second -> third, ...).

    Parameters
    ----------
    dot_graph : graph object offering ``add_edge(source, target, **attr)``
        the graph (or subgraph) that receives the chain edges
    token_ids : iterable of str
        IDs of the token nodes to chain together
    **attr
        attributes passed on unchanged to every edge, e.g.
        ``constraint='false'``, ``style='invis'``, ``weight=1``
    """
    ordered_ids = sorted(token_ids, key=natural_sort_key)
    # pair each token with the one that follows it; zero or one token
    # yields no pairs and therefore no edges
    for predecessor, successor in zip(ordered_ids, ordered_ids[1:]):
        dot_graph.add_edge(predecessor, successor, **attr)
def add_token_subgraph(dot_graph, token_ids=None, subgraph_id=None,
                       cluster=True, **attr):
    """
    Add a (cluster) subgraph containing the given token nodes to a dot graph.

    The subgraph is named ``cluster_tokens<ID>`` if ``cluster`` is true and
    ``tokens<ID>`` otherwise. Add ``rank='same'`` to put the tokens on the
    same rank.

    Parameters
    ----------
    dot_graph : pygraphviz.agraph.AGraph
        the graph (or subgraph) the new subgraph is added to
    token_ids : iterable of str or None
        token node IDs to put into the subgraph; they are added in natural
        sort order. ``None`` or an empty collection is passed on unchanged.
    subgraph_id : int or str or None
        suffix of the subgraph name. If ``None``, the number of subgraphs
        of the root graph (or of ``dot_graph`` itself, if it has no root)
        is used.
    cluster : bool
        whether the subgraph name gets the ``cluster_`` prefix
    **attr
        attributes passed on to ``dot_graph.add_subgraph``

    Returns
    -------
    token_subgraph : pygraphviz.agraph.AGraph
        the newly added subgraph that contains all the token nodes
    """
    subgraph_name = 'cluster_tokens' if cluster else 'tokens'

    if subgraph_id is None:
        # subgraph_root() returns None if dot_graph is itself a root graph
        root_graph = dot_graph.subgraph_root()
        # Compare with None explicitly: a graph object that defines
        # __len__ (as AGraph does) is falsy while it holds no nodes, so a
        # plain truth test would mistake an empty root for "no root".
        counting_graph = dot_graph if root_graph is None else root_graph
        subgraph_id = len(counting_graph.subgraphs())

    if token_ids:
        token_ids = sorted(token_ids, key=natural_sort_key)

    return dot_graph.add_subgraph(nbunch=token_ids,
                                  name="{}{}".format(subgraph_name, subgraph_id),
                                  **attr)
def add_sentence_subgraph(dot_graph, sentence_root_node_ids):
    """
    Group all sentence root nodes in one subgraph on the same rank.

    Parameters
    ----------
    dot_graph : pygraphviz.agraph.AGraph
        the graph the subgraph is added to
    sentence_root_node_ids : list of str
        IDs of the sentence root nodes

    Returns
    -------
    sentence_subgraph : pygraphviz.agraph.AGraph
        a subgraph named ``sentence-roots`` that contains all sentence
        root nodes
    """
    subgraph_options = {'name': 'sentence-roots', 'rank': 'same'}
    sentence_subgraph = dot_graph.add_subgraph(sentence_root_node_ids,
                                               **subgraph_options)
    return sentence_subgraph
In [3]:
def get_bottom_rst_spans(reversed_docgraph):
    """
    Collect the RST segments that directly span tokens.

    A node counts as a segment if ``'rst:segment'`` is in its ``layers``
    attribute. Its tokens are the sources of its incoming edges that carry
    the ``'rst:token'`` layer; its parents are the targets of its outgoing
    edges. Segments without any token are left out.

    Parameters
    ----------
    reversed_docgraph : graph with ``nodes(data=True)``, ``in_edges``,
        ``out_edges`` and a ``node`` attribute dictionary

    Returns
    -------
    token_spans : list of (str, list of str, list of str)
        one ``(segment node id, token node ids in natural sort order,
        parent segment node ids)`` tuple per bottom segment
    """
    def is_rst_token(node_id):
        return 'rst:token' in reversed_docgraph.node[node_id]['layers']

    bottom_spans = []
    for segment_id, segment_attrs in reversed_docgraph.nodes(data=True):
        if 'rst:segment' not in segment_attrs['layers']:
            continue

        tokens = [source
                  for (source, _target) in reversed_docgraph.in_edges(segment_id)
                  if is_rst_token(source)]
        if not tokens:
            continue

        parents = [target
                   for (_source, target) in reversed_docgraph.out_edges(segment_id)]
        bottom_spans.append(
            (segment_id, sorted(tokens, key=natural_sort_key), parents))
    return bottom_spans
In [4]:
def nx2simpledot(nxgraph):
    """
    Convert a networkx multidigraph into a plain dot digraph.

    Only node and edge labels are carried over; all other attributes are
    dropped.

    Parameters
    ----------
    nxgraph : networkx multidigraph with a ``tokens`` attribute
        ``tokens`` lists the IDs of the token nodes; every token node is
        expected to have a ``label``

    Returns
    -------
    dotgraph : pygraphviz.agraph.AGraph
        a directed graph with the same nodes and edges
    """
    simple_graph = nx.DiGraph(nxgraph)  # collapses parallel edges
    dotgraph = pygraphviz.AGraph(directed=True)

    # token nodes first, in the order given by nxgraph.tokens
    for token_id in nxgraph.tokens:
        dotgraph.add_node(token_id, label=simple_graph.node[token_id]['label'])

    # all remaining nodes; a label is only set if the node has one
    for node_id in set(simple_graph.nodes_iter()).difference(nxgraph.tokens):
        node_attrs = simple_graph.node[node_id]
        label_kwargs = {'label': node_attrs['label']} if 'label' in node_attrs else {}
        dotgraph.add_node(node_id, **label_kwargs)

    for source, target, edge_attrs in simple_graph.edges_iter(data=True):
        label_kwargs = {'label': edge_attrs['label']} if 'label' in edge_attrs else {}
        dotgraph.add_edge(source, target, **label_kwargs)
    return dotgraph
In [5]:
# Render the Tiger syntax graph of the document to 'tiger.dot'.
tiger_filepath = os.path.join(os.path.expanduser(TIGER_DIR), DOC_ID + '.xml')
tdg = TigerDocumentGraph(tiger_filepath)

nx_dotgraph = nx2simpledot(tdg)
nx_dotgraph.graph_attr['newrank'] = 'true'
nx_dotgraph.node_attr['fontname'] = 'Sans-Serif'
nx_dotgraph.edge_attr['fontname'] = 'Sans-Serif'

# put all tokens on one rank and chain them with dotted edges
add_token_subgraph(nx_dotgraph, tdg.tokens, cluster=False, rank='same')
add_precedence_relations(nx_dotgraph, tdg.tokens, style='dotted')
# optional: add_sentence_subgraph(nx_dotgraph, tdg.sentences)

# The file name contains no placeholder, so the former .format(DOC_ID)
# call had no effect and is dropped; the output name is unchanged.
nx_dotgraph.write('tiger.dot')
In [6]:
# Render the RST graph of the document to 'rst.dot'.
from discoursegraphs.readwrite import RSTGraph

rst_filepath = os.path.join(os.path.expanduser(RST_DIR), DOC_ID + '.rs3')
rdg = RSTGraph(rst_filepath)

rdg_dotgraph = nx2simpledot(rdg)
rdg_dotgraph.graph_attr['newrank'] = 'true'
rdg_dotgraph.node_attr['fontname'] = 'Sans-Serif'
rdg_dotgraph.edge_attr['fontname'] = 'Sans-Serif'

# put all tokens on one rank and chain them with dotted edges
add_token_subgraph(rdg_dotgraph, rdg.tokens, cluster=False, rank='same')
add_precedence_relations(rdg_dotgraph, rdg.tokens, style='dotted')

# The file name contains no placeholder, so the former .format(DOC_ID)
# call had no effect and is dropped; the output name is unchanged.
rdg_dotgraph.write('rst.dot')
In [7]:
# Render the RST graph with all edges reversed to 'rst-reversed.dot'.
# NOTE(review): write_dot is not used in any of the cells shown here.
from networkx import write_dot

rdg = RSTGraph(rst_filepath)
rdg.reverse(copy=False)  # reverses the edges of rdg itself
reversed_rdg = rdg  # just a new name for the (now reversed) graph

reversed_rst_dotgraph = nx2simpledot(reversed_rdg)
reversed_rst_dotgraph.graph_attr['newrank'] = 'true'
reversed_rst_dotgraph.node_attr['fontname'] = 'Sans-Serif'
reversed_rst_dotgraph.edge_attr['fontname'] = 'Sans-Serif'

# put all tokens on one rank and chain them with dotted edges
add_token_subgraph(reversed_rst_dotgraph, reversed_rdg.tokens, cluster=False, rank='same')
add_precedence_relations(reversed_rst_dotgraph, reversed_rdg.tokens, style='dotted')

# The file name contains no placeholder, so the former .format(DOC_ID)
# call had no effect and is dropped; the output name is unchanged.
reversed_rst_dotgraph.write('rst-reversed.dot')
In [8]:
# Merge the reversed RST graph into the Tiger graph and render the result
# to 'tiger-rst.dot'.
# NOTE(review): merge_graphs() is called on the tdg created in an earlier
# cell; presumably this cell should not be re-run without re-creating tdg
# first -- confirm.
tdg.merge_graphs(reversed_rdg)

reversed_tdg_rdg_dotgraph = nx2simpledot(tdg)
reversed_tdg_rdg_dotgraph.graph_attr['newrank'] = 'true'
reversed_tdg_rdg_dotgraph.graph_attr['compound'] = 'true'
reversed_tdg_rdg_dotgraph.node_attr['fontname'] = 'Sans-Serif'
reversed_tdg_rdg_dotgraph.edge_attr['fontname'] = 'Sans-Serif'

# put all tokens on one rank and chain them with dotted edges
add_token_subgraph(reversed_tdg_rdg_dotgraph, tdg.tokens, cluster=False, rank='same')
add_precedence_relations(reversed_tdg_rdg_dotgraph, tdg.tokens, style='dotted')

# The file name contains no placeholder, so the former .format(DOC_ID)
# call had no effect and is dropped; the output name is unchanged.
reversed_tdg_rdg_dotgraph.write('tiger-rst.dot')
In [9]:
# Render the Tiger syntax graph with its tokens grouped into one cluster
# per bottom RST segment; the result is written to 'clusters.dot'.
tdg = TigerDocumentGraph(tiger_filepath)
rdg = RSTGraph(rst_filepath)
rdg.reverse(copy=False)
reversed_rdg = rdg
# NOTE(review): the merge is presumably needed for its effect on
# reversed_rdg's token IDs (cell In [34] calls rename_tokens explicitly
# for that purpose) -- confirm.
tdg.merge_graphs(reversed_rdg)
# NOTE(review): reversed_tdg_rdg_dotgraph is not read again in this cell.
reversed_tdg_rdg_dotgraph = nx2simpledot(tdg)
bottom_rst_spans = get_bottom_rst_spans(reversed_rdg)

# a second, unmerged Tiger graph supplies the syntax nodes and edges
tdg2 = TigerDocumentGraph(tiger_filepath)
tiger_digraph = nx.DiGraph(tdg2)  # convert multidigraph to digraph

result_dotgraph = pygraphviz.AGraph(directed=True)
result_dotgraph.graph_attr['newrank'] = 'true'
result_dotgraph.graph_attr['compound'] = 'true'
result_dotgraph.node_attr['fontname'] = 'Sans-Serif'
result_dotgraph.edge_attr['fontname'] = 'Sans-Serif'

# one cluster per bottom RST segment, holding the tokens of that segment
for i, (segment_id, span, parent_ids) in enumerate(bottom_rst_spans):
    cluster = add_token_subgraph(result_dotgraph, span, cluster=True,
                                 subgraph_id=i, rank='same')
    cluster.node_attr['shape'] = 'plaintext'
    for node_id in span:
        cluster.add_node(node_id,
                         label=u"{}_{}".format(node_id, tiger_digraph.node[node_id]['label']))
    # nested, non-cluster subgraph that carries the precedence edges
    token_subgraph = add_token_subgraph(cluster, cluster=False, subgraph_id=i)
    # token_subgraph.graph_attr['rankdir'] = 'LR'
    add_precedence_relations(token_subgraph, span, style='dotted', weight='1000')

# all non-token nodes of the Tiger graph
non_terminals = set(tiger_digraph.nodes_iter()).difference(tdg2.tokens)
for node_id in non_terminals:
    node_attrs = tiger_digraph.node[node_id]
    if 'label' in node_attrs:
        result_dotgraph.add_node(node_id,
                                 label=u"{}_{}".format(node_id, node_attrs['label']))
    else:
        result_dotgraph.add_node(node_id)

# Tiger edges, ordered by their target node; the attribute dict yielded by
# edges_iter() is used directly instead of looking the edge up again
for source, target, edge_attrs in sorted(tiger_digraph.edges_iter(data=True),
                                         key=itemgetter(1)):
    if 'label' in edge_attrs:
        result_dotgraph.add_edge(source, target, label=edge_attrs['label'])
    else:
        result_dotgraph.add_edge(source, target)

# The file name contains no placeholder, so the former .format(DOC_ID)
# call had no effect and is dropped; the output name is unchanged.
result_dotgraph.write('clusters.dot')
In [32]:
def gen_edge(source, target, **attr):
    """
    Generate the DOT statement for an edge.

    Attribute values are wrapped in double quotes (as ``gen_node`` does),
    so values containing spaces, hyphens or other characters that are not
    allowed in a bare DOT ID still produce valid DOT. Double quotes inside
    a value are escaped as ``\\"``. Attributes are emitted in sorted key
    order so that the output is reproducible.

    Parameters
    ----------
    source : str
        ID of the node the edge starts from
    target : str
        ID of the node the edge points to
    **attr
        edge attributes, e.g. ``label='elaboration'`` or ``style='invis'``

    Returns
    -------
    edge : unicode
        a line of the form ``"source" -> "target" [key="value", ...];``
        terminated by a newline
    """
    rendered_attrs = u', '.join(
        u'{}="{}"'.format(key, u'{}'.format(attr[key]).replace(u'"', u'\\"'))
        for key in sorted(attr))
    return u'"{}" -> "{}" [{}];\n'.format(source, target, rendered_attrs)
def gen_node(name, **attr):
    """
    Generate the DOT statement for a node.

    Attribute values are wrapped in double quotes; double quotes inside a
    value (e.g. a quotation-mark token used as a label) are escaped as
    ``\\"`` so that the statement stays valid DOT. Attributes are emitted
    in sorted key order so that the output is reproducible.

    Parameters
    ----------
    name : str
        ID of the node
    **attr
        node attributes, e.g. ``label='Haus'``

    Returns
    -------
    node : unicode
        a line of the form ``"name" [key="value", ...];`` terminated by a
        newline
    """
    rendered_attrs = u', '.join(
        u'{}="{}"'.format(key, u'{}'.format(attr[key]).replace(u'"', u'\\"'))
        for key in sorted(attr))
    return u'"{}" [{}];\n'.format(name, rendered_attrs)
def gen_subgraph(name, subgraphs=None, nodes=None, edges=None,
                 graph_attrs={'rank': 'same', 'newrank': 'true'},
                 node_attrs={},
                 edge_attrs={}):
    """
    Generate the DOT source of a subgraph.

    Parameters
    ----------
    name : str
        name of the subgraph (inserted as is, without quoting)
    subgraphs : list of unicode or None
        DOT sources of nested subgraphs; concatenated without separator
    nodes : list of unicode or None
        node statements; joined with tabs
    edges : list of unicode or None
        edge statements; joined with tabs
    graph_attrs, node_attrs, edge_attrs : dict
        default attributes for the subgraph, its nodes and its edges;
        every value is written in double quotes. The default dictionaries
        are only read, never modified.

    Returns
    -------
    subgraph : unicode
        the ``subgraph name { ... }`` block, followed by an empty line
    """
    def render(attrs):
        return u', '.join(u'{}="{}"'.format(key, value)
                          for key, value in attrs.items())

    node_block = '\t'.join(nodes) if nodes else ''
    edge_block = '\t'.join(edges) if edges else ''
    nested_block = ''.join(subgraphs) if subgraphs else ''

    template = (u'\nsubgraph {} {{\n'
                u'graph [{}];\n'
                u'node [{}];\n'
                u'edge [{}];\n'
                u'{}\n'
                u'{}\n'
                u'{}\n'
                u'}}\n\n')
    return template.format(name,
                           render(graph_attrs),
                           render(node_attrs),
                           render(edge_attrs),
                           node_block,
                           edge_block,
                           nested_block)
def gen_cluster(name, subgraphs=None, nodes=None, edges=None,
                graph_attrs={'rank': 'same', 'newrank': 'true'},
                node_attrs={},
                edge_attrs={}):
    """
    Generate the DOT source of a cluster subgraph.

    This is ``gen_subgraph`` with ``cluster_`` prepended to the name; all
    other arguments are handed over unchanged.

    Parameters
    ----------
    name : str
        name of the cluster, without the ``cluster_`` prefix
    subgraphs, nodes, edges, graph_attrs, node_attrs, edge_attrs
        see ``gen_subgraph``

    Returns
    -------
    cluster : unicode
        the DOT source of the subgraph named ``cluster_<name>``
    """
    prefixed_name = "cluster_" + name
    return gen_subgraph(prefixed_name,
                        subgraphs=subgraphs,
                        nodes=nodes,
                        edges=edges,
                        graph_attrs=graph_attrs,
                        node_attrs=node_attrs,
                        edge_attrs=edge_attrs)
def gen_digraph(subgraphs=None, nodes=None, edges=None,
                graph_attrs={'newrank': 'true', 'compound': 'true'},
                node_attrs={'fontname': 'Sans-Serif'},
                edge_attrs={'fontname': 'Sans-Serif'}):
    """
    Generate the DOT source of a complete directed graph.

    Parameters
    ----------
    subgraphs : list of unicode or None
        DOT sources of the subgraphs; concatenated without separator
    nodes : list of unicode or None
        node statements; joined with tabs
    edges : list of unicode or None
        edge statements; joined with tabs
    graph_attrs, node_attrs, edge_attrs : dict
        default attributes for the graph, its nodes and its edges; every
        value is written in double quotes. The default dictionaries are
        only read, never modified.

    Returns
    -------
    digraph : unicode
        the ``digraph { ... }`` block; subgraphs come first, followed by
        the nodes and then the edges
    """
    def render(attrs):
        return u', '.join(u'{}="{}"'.format(key, value)
                          for key, value in attrs.items())

    subgraph_block = ''.join(subgraphs) if subgraphs else ''
    node_block = '\t'.join(nodes) if nodes else ''
    edge_block = '\t'.join(edges) if edges else ''

    template = (u'\ndigraph {{\n'
                u'graph [{}];\n'
                u'node [{}];\n'
                u'edge [{}];\n'
                u'{}\n'
                u'{}\n'
                u'{}\n'
                u'}}\n')
    return template.format(render(graph_attrs),
                           render(node_attrs),
                           render(edge_attrs),
                           subgraph_block,
                           node_block,
                           edge_block)
In [33]:
def reversedrst2simpledot(nxgraph):
    """
    Convert a reversed RST graph into a dot graph with one cluster per
    bottom RST segment.

    The bottom segment nodes themselves are left out: the tokens of each
    segment are put into a cluster, and an edge from the first token of the
    segment to each parent of the segment (with ``ltail`` set to the
    cluster) stands in for the segment node.

    Parameters
    ----------
    nxgraph : networkx multidigraph with a ``tokens`` attribute
        the reversed RST graph; every token node is expected to have a
        ``label``

    Returns
    -------
    dotgraph : pygraphviz.agraph.AGraph
        the resulting directed graph
    """
    digraph = nx.DiGraph(nxgraph)  # convert multidigraph to digraph
    digraph.tokens = nxgraph.tokens

    dotgraph = pygraphviz.AGraph(directed=True)
    dotgraph.graph_attr['newrank'] = 'true'
    dotgraph.graph_attr['compound'] = 'true'
    dotgraph.node_attr['fontname'] = 'Sans-Serif'
    dotgraph.edge_attr['fontname'] = 'Sans-Serif'

    for node_id in nxgraph.tokens:
        dotgraph.add_node(node_id, label=digraph.node[node_id]['label'])

    bottom_rst_spans = get_bottom_rst_spans(digraph)

    # Tokens and bottom segments are not part of the hierarchy. Membership
    # is tested against sets, which avoids rescanning lists for every node
    # and every edge.
    non_hierarchy_ids = set(digraph.tokens)
    non_hierarchy_ids.update(segment_node
                             for (segment_node, _tokens, _parents)
                             in bottom_rst_spans)
    hierarchy_nodes = [node for node in digraph.nodes()
                       if node not in non_hierarchy_ids]
    hierarchy_node_set = set(hierarchy_nodes)

    for node_id in hierarchy_nodes:
        node_attrs = digraph.node[node_id]
        if 'label' in node_attrs:
            dotgraph.add_node(node_id, label=node_attrs['label'])
        else:
            dotgraph.add_node(node_id)

    # leaving out segment nodes by connecting token nodes directly to
    # the parent nodes of segment nodes
    for (segment_node, token_nodes, parent_nodes) in bottom_rst_spans:
        cluster_name = 'cluster_segment{}'.format(segment_node)
        segment_cluster = dotgraph.add_subgraph(token_nodes,
                                                name=cluster_name,
                                                rank='same')
        # set up color and label for the segment cluster
        segment_cluster.graph_attr['style'] = 'filled'
        segment_cluster.graph_attr['color'] = 'lightgrey'
        segment_cluster.graph_attr['label'] = 'segment {}'.format(segment_node)
        segment_cluster.node_attr['style'] = 'filled'
        segment_cluster.node_attr['color'] = 'white'

        token_subgraph = segment_cluster.add_subgraph(
            name='tokens_segment{}'.format(segment_node), rank='same')
        # add invisible edges between the tokens of a segment
        for token_node, next_token_node in zip(token_nodes, token_nodes[1:]):
            token_subgraph.add_edge(token_node, next_token_node, style='invis')

        # add an edge from the first token of a segment to all the parent
        # nodes of the segment (with its tail pointing to the cluster);
        # get_bottom_rst_spans() only returns segments with tokens
        for parent_node in parent_nodes:
            dotgraph.add_edge(token_nodes[0], parent_node, ltail=cluster_name)

    # add all other, hierarchical edges
    for source, target, edge_attrs in digraph.edges_iter(data=True):
        # don't add edges from tokens or bottom segments
        if source not in hierarchy_node_set:
            continue
        if 'label' in edge_attrs:
            dotgraph.add_edge(source, target, label=edge_attrs['label'])
        else:
            dotgraph.add_edge(source, target)
    return dotgraph
In [34]:
# Render the reversed RST graph with segment clusters (built with
# pygraphviz) to 'rst-reversed-clusters.dot'.
from discoursegraphs.discoursegraph import rename_tokens

tdg = TigerDocumentGraph(tiger_filepath)
rdg = RSTGraph(rst_filepath)

rdg.reverse(copy=False)
reversed_rdg = rdg  # just a new name to clarify things

rename_tokens(reversed_rdg, tdg)  # use Tiger token IDs in the RST graph

reversedrst_dot = reversedrst2simpledot(reversed_rdg)
reversedrst_dot.write("rst-reversed-clusters.dot")
In [39]:
def reversedrst2manualdot(nxgraph):
    """
    Convert a reversed RST graph into DOT source text with one cluster per
    bottom RST segment.

    The bottom segment nodes themselves are left out: the tokens of each
    segment are put into a cluster, and an edge from the first token of the
    segment to each parent of the segment (with ``ltail`` set to the
    cluster) stands in for the segment node.

    Parameters
    ----------
    nxgraph : networkx multidigraph with a ``tokens`` attribute
        the reversed RST graph; every token node is expected to have a
        ``label``

    Returns
    -------
    dotgraph : unicode
        the DOT source produced by ``gen_digraph``. This is plain text,
        not a graph object; write it to a file to save it.
    """
    digraph = nx.DiGraph(nxgraph)  # convert multidigraph to digraph
    digraph.tokens = nxgraph.tokens

    bottom_rst_spans = get_bottom_rst_spans(digraph)

    # tokens and bottom segments are not part of the hierarchy
    non_hierarchy_ids = set(digraph.tokens)
    non_hierarchy_ids.update(segment_node_id
                             for (segment_node_id, _tokens, _parents)
                             in bottom_rst_spans)
    hierarchy_node_ids = [node for node in digraph.nodes()
                          if node not in non_hierarchy_ids]
    hierarchy_id_set = set(hierarchy_node_ids)

    # generate hierarchical nodes (non-terminals, non-bottom-rst-segments)
    hierarchy_nodes = []
    for node_id in hierarchy_node_ids:
        node_attrs = digraph.node[node_id]
        if 'label' in node_attrs:
            hierarchy_nodes.append(gen_node(node_id, label=node_attrs['label']))
        else:
            hierarchy_nodes.append(gen_node(node_id))

    # generate hierarchical edges (i.e. edges that don't start from a
    # token node or a bottom segment)
    hierarchical_edges = []
    for source, target, edge_attrs in digraph.edges_iter(data=True):
        if source not in hierarchy_id_set:
            continue
        if 'label' in edge_attrs:
            hierarchical_edges.append(
                gen_edge(source, target, label=edge_attrs['label']))
        else:
            hierarchical_edges.append(gen_edge(source, target))

    # leaving out segment nodes by connecting token nodes directly to
    # the parent nodes of segment nodes
    segment_clusters = []
    token2segment_edges = []
    # The original statement here was the bare name `cluster_precedences`,
    # which raises a NameError. Nothing ever fills the list, so it is
    # initialised empty. NOTE(review): edges between clusters were
    # presumably planned but never implemented -- confirm.
    cluster_precedences = []

    for (segment_node_id, token_node_ids, parent_node_ids) in bottom_rst_spans:
        # create all token nodes in a bottom rst segment
        token_nodes = [gen_node(token_node_id,
                                label=digraph.node[token_node_id]['label'])
                       for token_node_id in token_node_ids]

        # generate invisible edges between the tokens of a segment
        token_precedences = [gen_edge(token_node_id, next_token_node_id,
                                      style='invis')
                             for token_node_id, next_token_node_id
                             in zip(token_node_ids, token_node_ids[1:])]

        # add all token nodes (and their precedence edges) to a subgraph
        token_subgraph = gen_subgraph(
            name='tokens_segment{}'.format(segment_node_id),
            nodes=token_nodes, edges=token_precedences,
            graph_attrs={'rank': 'same'})

        # add the subgraph to a bottom rst segment cluster and keep the
        # cluster (it was generated but never stored before, so the
        # resulting graph contained no clusters and no tokens)
        segment_name = 'segment{}'.format(segment_node_id)
        segment_cluster = gen_cluster(name=segment_name,
                                      subgraphs=[token_subgraph],
                                      graph_attrs={'rank': 'same',
                                                   'style': 'filled',
                                                   'color': 'lightgrey',
                                                   'label': segment_name},
                                      node_attrs={'style': 'filled',
                                                  'color': 'white'})
        segment_clusters.append(segment_cluster)

        # gen_cluster() prefixes the name with 'cluster_'; ltail has to
        # name the cluster exactly as it appears in the DOT source
        cluster_name = 'cluster_' + segment_name

        # generate edges from the first token of a segment to all the
        # parent nodes of the segment (with its tail pointing to the
        # cluster)
        for parent_node_id in parent_node_ids:
            token2segment_edges.append(gen_edge(token_node_ids[0],
                                                parent_node_id,
                                                ltail=cluster_name))

    dotgraph = gen_digraph(
        subgraphs=segment_clusters,
        nodes=hierarchy_nodes,
        edges=token2segment_edges + hierarchical_edges + cluster_precedences)
    return dotgraph
In [40]:
# Render the reversed RST graph with segment clusters (DOT source
# generated as text) to 'rst-reversed-manual-clusters.dot'.
from discoursegraphs.discoursegraph import rename_tokens

tdg = TigerDocumentGraph(tiger_filepath)
rdg = RSTGraph(rst_filepath)
rdg.reverse(copy=False)
reversed_rdg = rdg  # just a new name to clarify things
rename_tokens(reversed_rdg, tdg)  # use Tiger token IDs in the RST graph

reversedrst_dot = reversedrst2manualdot(reversed_rdg)

# reversedrst2manualdot() returns the DOT source as a string, which has no
# write() method; the encoded text is written to the file instead.
with open("rst-reversed-manual-clusters.dot", 'wb') as outfile:
    outfile.write(reversedrst_dot.encode('utf8'))
In [13]:
# Build the DOT source (as text) of the Tiger syntax graph, with one
# token subgraph per bottom RST segment; the result is stored in `digraph`.
tdg = TigerDocumentGraph(tiger_filepath)
rdg = RSTGraph(rst_filepath)
rdg.reverse(copy=False)
reversed_rdg = rdg
tdg.merge_graphs(reversed_rdg)
# NOTE(review): reversed_tdg_rdg_dotgraph is not read again in this cell.
reversed_tdg_rdg_dotgraph = nx2simpledot(tdg)
bottom_rst_spans = get_bottom_rst_spans(reversed_rdg)

# a second, unmerged Tiger graph supplies the syntax nodes and edges
tdg2 = TigerDocumentGraph(tiger_filepath)
tiger_digraph = nx.DiGraph(tdg2)  # convert multidigraph to digraph

# one subgraph per bottom RST segment: its tokens plus dotted precedence
# edges between neighbouring tokens
token_subgraphs = []
for i, (segment_id, span, parent_ids) in enumerate(bottom_rst_spans):
    span_nodes = []
    for node_id in span:
        span_nodes.append(gen_node(node_id, label=tiger_digraph.node[node_id]['label']))
    precedence_edges = []
    for idx, node in enumerate(span[:-1]):
        precedence_edges.append(gen_edge(node, span[idx+1], constraint='true', style='dotted'))
    token_subgraphs.append(
        gen_subgraph(name='tokens{}'.format(i), nodes=span_nodes, edges=precedence_edges))

# all non-token nodes; nodes without a label are labelled with their ID
non_terminals = set(tiger_digraph.nodes_iter()).difference(tdg2.tokens)
nonterminal_nodes = []
for node_id in non_terminals:
    nonterminal_nodes.append(
        gen_node(node_id, label=tiger_digraph.node[node_id].get('label', node_id)))

# Tiger edges, ordered by their target node
nonprecedence_edges = []
for source, target, edge_attrs in sorted(tiger_digraph.edges_iter(data=True),
                                         key=itemgetter(1)):
    label_kwargs = {'label': edge_attrs['label']} if 'label' in edge_attrs else {}
    nonprecedence_edges.append(gen_edge(source, target, **label_kwargs))

digraph = gen_digraph(subgraphs=token_subgraphs,
                      nodes=nonterminal_nodes,
                      edges=nonprecedence_edges)
In [14]:
# Save the generated DOT source. The text is encoded to UTF-8 bytes, so the
# file is opened in binary mode: a text-mode handle rejects bytes on
# Python 3 and may translate line endings on some platforms.
with open('manual.dot', 'wb') as outfile:
    outfile.write(digraph.encode('utf8'))