In [1]:
%load_ext gvmagic
import os
import networkx as nx
from discoursegraphs import print_dot, info
from discoursegraphs.readwrite import TigerDocumentGraph
from nxpd import draw
# Syntax annotation file of document maz-1423; '~' is expanded to the
# current user's home directory.
# NOTE(review): the corpus location is hard-coded -- adjust it to your setup.
TIGER_FILE = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/syntax/maz-1423.xml')
In [2]:
# Build the document graph from the syntax file.
# info() comes from discoursegraphs; presumably it prints a summary of the
# graph -- TODO confirm.
tdg = TigerDocumentGraph(TIGER_FILE) # multidigraph
info(tdg)
In [3]:
# Copy the multidigraph into a plain DiGraph. A DiGraph holds at most one
# edge per (source, target) pair, so parallel edges are collapsed here.
tdg_digraph = nx.DiGraph(tdg)
info(tdg_digraph)
In [4]:
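# draw(tdg)
# Disabled: drawing the multidigraph fails because the generated dot source
# is rejected (a node statement apparently carries an empty label):
# InvocationException: Program terminated with status: 1. stderr follows: Error: /tmp/tmpfFx_6T:92: syntax error near line 92
# context: s525_15 >>> [label=, <<< ];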
In [5]:
%dotstr print_dot(tdg_digraph)
In [28]:
def graph2string(graph):
    """Serialize a graph into a string by walking its edges depth-first.

    The walk starts at ``graph.root``. Every visited edge contributes one
    ``<source label>#<edge label>#<target label>##`` record; the records are
    concatenated in traversal order.

    Parameters
    ----------
    graph : networkx graph with a ``root`` attribute
        Both simple graphs and multigraphs are supported.

    Returns
    -------
    unicode
        The concatenated edge records (empty if no edge is reachable from
        the root).
    """
    def node_label(node_id):
        # Nodes without a 'label' attribute are represented by their ID.
        return graph.node[node_id].get('label', node_id)

    def edge_label(source, target):
        edge_data = graph[source][target]
        if graph.is_multigraph():
            # In a multigraph, graph[u][v] maps edge keys to attribute
            # dicts, so the labels live one level deeper. Looking up
            # 'label' directly would always yield the fallback. Labels of
            # parallel edges are sorted to keep the result deterministic.
            parallel_labels = sorted(
                u'{0}'.format(attrs.get('label', u' '))
                for attrs in edge_data.values())
            return u'|'.join(parallel_labels)
        return edge_data.get('label', u' ')

    # Join once instead of growing the string with += in a loop.
    return u''.join(
        u'{0}#{1}#{2}##'.format(
            node_label(source), edge_label(source, target), node_label(target))
        for source, target in nx.dfs_edges(graph, graph.root))
In [30]:
# Fingerprint the document graph via its depth-first edge string.
# NOTE(review): the built-in hash() of a string may differ between
# interpreter runs when hash randomization is enabled; use hashlib if the
# value has to be stored or compared across sessions.
hash(graph2string(tdg))
Out[30]:
In [6]:
# Inspect the attributes stored on the edge s538_504 -> s538_12.
tdg_digraph['s538_504']['s538_12']
Out[6]:
In [7]:
from discoursekernels.dependency_graph import (
get_dependency_rules, get_dependency_subgraphs, generate_all_unique_dependency_subgraphs)
In [8]:
# Show the first five results only; note that list() still consumes
# everything get_dependency_rules() yields before the slice is taken.
list(get_dependency_rules(tdg_digraph))[:5]
# TODO: filter root node, VROOT nodes
Out[8]:
In [9]:
# intractably slow
# list(get_dependency_subgraphs(tdg_digraph))
In [31]:
from discoursekernels.tree import tree_kernel_polynomial
In [37]:
# Nodes lacking a 'label' attribute fall back to their own node ID;
# existing labels are left untouched.
for node_id in tdg_digraph:
    tdg_digraph.node[node_id].setdefault('label', node_id)
In [38]:
%time tree_kernel_polynomial(tdg_digraph, tdg_digraph)
Out[38]:
202M -rw-r--r-- 1 arne arne 202M Dec 14 13:49 maz-00001.pickle
99M -rw-r--r-- 1 arne arne 99M Dec 14 13:51 maz-00002.pickle
15M -rw-r--r-- 1 arne arne 15M Dec 14 13:52 maz-10110.pickle
32M -rw-r--r-- 1 arne arne 32M Dec 14 13:52 maz-10175.pickle
5.0M -rw-r--r-- 1 arne arne 5.0M Dec 14 13:53 maz-10205.pickle
1.3G -rw-r--r-- 1 arne arne 1.3G Dec 14 17:21 maz-10207.pickle
In [39]:
import os
import sys
import cPickle as pickle
from discoursegraphs.readwrite import TigerDocumentGraph, RSTGraph, ConanoDocumentGraph
from discoursekernels.subgraph_enumeration import enumerate_all_subgraphs_upto_size_k
def generate_merged_graph(tiger_file, rst_file, conano_file):
    """Build one graph from three annotation files of the same document.

    A ``TigerDocumentGraph`` is created from ``tiger_file``; an ``RSTGraph``
    (from ``rst_file``) and a ``ConanoDocumentGraph`` (from ``conano_file``)
    are then merged into it, in that order, via ``merge_graphs``.

    Returns the TigerDocumentGraph that the other two graphs were merged
    into.
    """
    merged = TigerDocumentGraph(tiger_file)
    # Both additional graphs are constructed before the first merge happens.
    graphs_to_merge = (RSTGraph(rst_file), ConanoDocumentGraph(conano_file))
    for other_graph in graphs_to_merge:
        merged.merge_graphs(other_graph)
    return merged
In [42]:
# Annotation layers (syntax, RST, connectors) of document maz-10207.
tiger_file, rst_file, conano_file = map(os.path.expanduser, (
    '~/corpora/potsdam-commentary-corpus-2.0.0/syntax/maz-10207.xml',
    '~/corpora/potsdam-commentary-corpus-2.0.0/rst/maz-10207.rs3',
    '~/corpora/potsdam-commentary-corpus-2.0.0/connectors/maz-10207.xml',
))
merged_graph = generate_merged_graph(tiger_file, rst_file, conano_file)
In [43]:
# Summarize the merged graph (info() is imported from discoursegraphs).
info(merged_graph)
In [44]:
# Check whether the merged graph is still free of directed cycles.
nx.is_directed_acyclic_graph(merged_graph)
Out[44]:
In [48]:
def add_node_id_as_fallback_labels(graph):
    """Give every node without a 'label' attribute its node ID as label.

    The graph is modified in place; nodes that already carry a 'label' keep
    it. Nothing is returned.
    """
    node_attrs = graph.node
    for node_id in graph:
        # setdefault only writes when the key is absent.
        node_attrs[node_id].setdefault('label', node_id)
In [49]:
# Modifies merged_graph in place: unlabeled nodes get their ID as label.
add_node_id_as_fallback_labels(merged_graph)
In [52]:
%time tree_kernel_polynomial(merged_graph, merged_graph)
Out[52]:
In [ ]:
In [45]:
%dotstr print_dot(merged_graph)
In [54]:
# Export the merged graph in dot format.
# NOTE(review): the output path under /tmp is hard-coded.
nx.write_dot(merged_graph, '/tmp/merged.dot')