In [1]:
from discoursekernels.util import print_source
from discoursekernels.subgraph_enumeration import (
open_neighborhood, exclusive_neighborhood, enumerate_all_size_k_subgraphs,
enumerate_all_size_k_subgraphs_parellel,
extend_subgraph,
enumerate_all_subgraphs_upto_size_k,
enumerate_all_subgraphs_upto_size_k_parallel)
print_source(open_neighborhood)
Out[1]:
In [2]:
print_source(exclusive_neighborhood)
Out[2]:
In [3]:
from nxpd import draw
from discoursekernels.test_dependency_graph import the_man_saw_the_woman_with_the_telescope
draw(the_man_saw_the_woman_with_the_telescope, show='ipynb')
Out[3]:
In [4]:
for n, attrs in the_man_saw_the_woman_with_the_telescope.nodes(data=True):
print n, attrs['label']
In [5]:
open_neighborhood(the_man_saw_the_woman_with_the_telescope, set([5]))
Out[5]:
In [6]:
open_neighborhood(the_man_saw_the_woman_with_the_telescope.to_undirected(), set([5]))
Out[6]:
In [7]:
exclusive_neighborhood(the_man_saw_the_woman_with_the_telescope.to_undirected(), 2, set([5, 6]))
Out[7]:
starting with a node $v$ from the input graph, we only add those nodes to the $V_{Extension}$ set that have these properties:
In [8]:
print_source(enumerate_all_size_k_subgraphs)
Out[8]:
In [9]:
print_source(extend_subgraph)
Out[9]:
In [10]:
import os
import networkx as nx
from discoursegraphs.readwrite import TigerDocumentGraph
TIGER_FILE = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/syntax/maz-1423.xml')
tdg = TigerDocumentGraph(TIGER_FILE)
In [11]:
tdg_digraph = nx.DiGraph(tdg)
# nx.convert_node_labels_to_integers(tdg, first_label=1, label_attribute='node_id')
tdg_with_in_labels = nx.convert_node_labels_to_integers(tdg_digraph, first_label=1, label_attribute='node_id')
# enumerate_all_size_k_subgraphs(tdg_with_in_labels, 10)
In [12]:
print_source(enumerate_all_subgraphs_upto_size_k)
Out[12]:
In [13]:
%timeit len(enumerate_all_subgraphs_upto_size_k(tdg_with_in_labels, 5))
In [14]:
# %timeit len(enumerate_all_subgraphs_upto_size_k(tdg_with_in_labels, 10)) # 20s per loop
In [15]:
from discoursegraphs import DiscourseDocumentGraph, print_dot
from discoursegraphs.readwrite import RSTGraph, MMAXDocumentGraph
In [16]:
rdg = RSTGraph(os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/rst/maz-1423.rs3'))
mdg = MMAXDocumentGraph(os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/coreference/maz-1423.mmax'))
In [17]:
merged_graph = DiscourseDocumentGraph()
merged_graph.merge_graphs(tdg.copy())
merged_graph.merge_graphs(rdg.copy())
merged_graph.merge_graphs(mdg.copy())
In [18]:
%load_ext gvmagic
In [19]:
# %dotstr print_dot(merged_graph)
In [20]:
len(merged_graph)
Out[20]:
In [21]:
%time len(enumerate_all_subgraphs_upto_size_k(merged_graph, 5))
Out[21]:
In [22]:
%time len(enumerate_all_subgraphs_upto_size_k(merged_graph, 6))
Out[22]:
In [23]:
%time len(enumerate_all_subgraphs_upto_size_k(merged_graph, 7))
Out[23]:
In [24]:
%time len(enumerate_all_subgraphs_upto_size_k(merged_graph, 8))
Out[24]:
In [27]:
print_source(enumerate_all_size_k_subgraphs_parellel)
Out[27]:
In [69]:
%time _x = enumerate_all_size_k_subgraphs_parellel(tdg_with_in_labels, 10)
In [70]:
%time _x = enumerate_all_size_k_subgraphs(tdg_with_in_labels, 5)
In [71]:
%time _x = enumerate_all_size_k_subgraphs_parellel(tdg_with_in_labels, 7)
In [72]:
%time _x = enumerate_all_size_k_subgraphs(tdg_with_in_labels, 7)
In [73]:
%time _x = enumerate_all_size_k_subgraphs_parellel(tdg_with_in_labels, 10)
In [74]:
%time _x = enumerate_all_size_k_subgraphs(tdg_with_in_labels, 10)
In [75]:
%time _x = enumerate_all_size_k_subgraphs_parellel(tdg_with_in_labels, 12)
In [76]:
%time _x = enumerate_all_size_k_subgraphs(tdg_with_in_labels, 12)
In [77]:
print_source(enumerate_all_subgraphs_upto_size_k_parallel)
Out[77]:
In [78]:
%time _result = enumerate_all_subgraphs_upto_size_k_parallel(merged_graph, 3)
In [79]:
%time _result = enumerate_all_subgraphs_upto_size_k(merged_graph, 3)
In [80]:
%time _result = enumerate_all_subgraphs_upto_size_k_parallel(merged_graph, 5)
In [81]:
%time _result = enumerate_all_subgraphs_upto_size_k_parallel(merged_graph, 5)
In [82]:
%time _result = enumerate_all_subgraphs_upto_size_k(merged_graph, 5)
In [ ]:
# %time _result = enumerate_all_subgraphs_upto_size_k_parallel(merged_graph, 10) # takes too long
In [ ]:
# %time _result = enumerate_all_subgraphs_upto_size_k_naive(merged_graph, 3)
In [ ]:
len(merged_graph)
In [ ]:
len(merged_graph) ** 3
In [ ]:
from math import factorial
def possible_combinations(num_of_elements, k):
# https://en.wikipedia.org/wiki/Combination
if k > num_of_elements:
return 0
else:
return factorial(num_of_elements) / ( factorial(k) * factorial((num_of_elements - k)) )
In [ ]:
def possible_combinations_up_to_k(num_of_elements, k):
assert 1 <= k <= num_of_elements
result = 0
for i in xrange(1, k+1):
result += possible_combinations(num_of_elements, i)
return result
In [ ]:
possible_combinations_up_to_k(356, 2)
In [ ]:
possible_combinations_up_to_k(356, 3)
In [ ]:
possible_combinations_up_to_k(356, 4)
In [ ]:
possible_combinations_up_to_k(356, 10)
In [20]:
import shelve
def enumerate_all_subgraphs_upto_size_k_shelved(document_graph, k):
"""
"""
shelve_name = '/tmp/{}-k-{}.db'.format(document_graph.name, k)
# writeback is important for nested dicts etc.)
subgraph_shelve = shelve.open(shelve_name, writeback=True)
subgraph_shelve['graph'] = document_graph
document_nodes = len(document_graph)
if k > document_nodes:
k = document_nodes
int_graph = nx.convert_node_labels_to_integers(nx.DiGraph(document_graph),
first_label=1,
label_attribute='node_id')
try:
for i in xrange(1, k+1):
enumerate_all_size_k_subgraphs_shelved(int_graph, i, subgraph_shelve)
finally:
subgraph_shelve.close() # this calls .sync() as well
return shelve_name
def enumerate_all_size_k_subgraphs_shelved(graph, k, graph_shelve):
"""
shelve : shelve
an open shelve to store the subgraphs in
Returns
-------
num_of_k_subgraphs : the number of subgraphs of size-k generated
"""
k_shelve = graph_shelve[str(k)] = dict()
assert all(isinstance(node, int) for node in graph.nodes_iter())
if not 1 <= k <= len(graph):
return 0
num_of_k_subgraphs = 1
for node in graph.nodes_iter():
extension = {neighbor for neighbor in graph.neighbors(node)
if neighbor > node}
subgraphs = extend_subgraph(graph, k, {node}, extension, node)
if isinstance(subgraphs, list):
for subgraph in subgraphs:
k_shelve[str(num_of_k_subgraphs)] = subgraph
num_of_k_subgraphs += 1
else: # isinstance(subgraphs, nx.Graph)
k_shelve[str(num_of_k_subgraphs)] = subgraphs
num_of_k_subgraphs += 1
return num_of_k_subgraphs
In [63]:
%time enumerate_all_subgraphs_upto_size_k_shelved(merged_graph, 3)
Out[63]:
In [64]:
%time enumerate_all_subgraphs_upto_size_k_shelved(merged_graph, 4)
Out[64]:
In [65]:
%time enumerate_all_subgraphs_upto_size_k_shelved(merged_graph, 5)
Out[65]:
In [67]:
%time enumerate_all_subgraphs_upto_size_k_shelved(merged_graph, 6)
Out[67]:
In [68]:
%time enumerate_all_subgraphs_upto_size_k_shelved(merged_graph, 7)
Out[68]:
In [21]:
%time enumerate_all_subgraphs_upto_size_k_shelved(merged_graph, 8)
Out[21]:
In [22]:
%time enumerate_all_subgraphs_upto_size_k_shelved(merged_graph, 9)
Out[22]:
In [21]:
%time enumerate_all_subgraphs_upto_size_k_shelved(merged_graph, 10) # ca. 12mins, started: 1714
Out[21]:
In [1]:
# import shelve
# k10_shelve = shelve.open('/tmp/maz-1423.xml-k-10.db')
In [2]:
k10_shelve.keys()
Out[2]:
for key in k10_shelve.keys():
print key, len(k10_shelve[key])
graph 356
1 356
2 234
3 486
4 1182
5 2884
6 7445
7 21355
8 66773
9 220405
10 752187