In [39]:
import itertools
import networkx as nx
from networkx import dfs_tree
from networkx.algorithms import isomorphism as iso
# install with: sudo pip install git+http://github.com/chebee7i/nxpd/#egg=nxpd
from nxpd import draw
from discoursekernels.tree import (
generate_all_unique_subtrees, contains_only_complete_productions,
get_subtrees, count_tree_fragment_occurances,
get_production_rules, common_subtrees,
tree_kernel_naive, tree_kernel_polynomial, find_all_common_subtrees_bruteforce)
from discoursekernels.tree import (
is_rooted_at_node, is_leave, is_proper, is_treefragment)
from discoursekernels.test_tree import (example_tree, tree_jeff_ate_cookies,
tree_alex_died, tree_steve_ate_bananas, tree_the_man_drank_wine,
tree_the_man_killed_the_woman, tree_fragment_alex, tree_fragment_npn)
from discoursekernels.util import print_source, label_nodes, draw_multiple_graphs
In [3]:
print_source(generate_all_unique_subtrees)
Out[3]:
In [4]:
unique_subtrees = generate_all_unique_subtrees(tree_alex_died, tree_jeff_ate_cookies,
tree_steve_ate_bananas, tree_the_man_drank_wine)
In [5]:
# NOTE: this doesn't count _how often_ the i-th component occurs in the tree, but only says
# if it occurs in the tree at all
tree_alex_died_vector = [int(is_treefragment(tree_alex_died, ust)) for ust in unique_subtrees]
function $h_i(T)$ counts how often the i-th tree fragment occurs in tree T
T is now represented as:
In [6]:
print_source(is_rooted_at_node)
Out[6]:
In [7]:
print_source(count_tree_fragment_occurances)
Out[7]:
In [8]:
count_tree_fragment_occurances(tree_alex_died, tree_alex_died)
Out[8]:
In [9]:
npdn = nx.DiGraph()
npdn.add_nodes_from(label_nodes([ (1, 'NP'), (2, 'D'), (3, 'N') ]))
npdn.add_edges_from([ (1, 2), (1, 3) ])
In [10]:
count_tree_fragment_occurances(tree=tree_the_man_drank_wine, subtree=npdn)
Out[10]:
In [11]:
count_tree_fragment_occurances(tree=tree_the_man_killed_the_woman, subtree=npdn)
Out[11]:
function $C(n_1, n_2)$ simply counts the number of common subtrees rooted at both $n_1$ and $n_2$ and is defined as $\sum_i I_i(n_1) I_i(n_2)$
NOTE: $C(n_1, n_2)$ can be computed in polynomial time:
In [12]:
print_source(common_subtrees)
Out[12]:
In [13]:
print_source(tree_kernel_naive)
Out[13]:
In [14]:
print_source(tree_kernel_polynomial)
Out[14]:
In [15]:
print_source(find_all_common_subtrees_bruteforce)
Out[15]:
In [16]:
%timeit tree_kernel_naive(tree_the_man_drank_wine, tree_the_man_killed_the_woman)
In [17]:
%timeit tree_kernel_polynomial(tree_the_man_drank_wine, tree_the_man_killed_the_woman)
In [18]:
%timeit find_all_common_subtrees_bruteforce(tree_the_man_drank_wine, tree_the_man_killed_the_woman)
In [19]:
tree_kernel_naive(tree_alex_died, tree_alex_died) \
== tree_kernel_polynomial(tree_alex_died, tree_alex_died) \
== len(find_all_common_subtrees_bruteforce(tree_alex_died, tree_alex_died))
Out[19]:
In [20]:
from IPython.display import Image
In [21]:
Image('img/a-dog.png')
Out[21]:
In [22]:
Image('img/a-cat.png')
Out[22]:
3 out of 5 structures are identical, therefore the similarity is equal to 3.
In [43]:
a_dog = nx.DiGraph()
a_dog.add_nodes_from(label_nodes([(1, 'NP'), (2, 'D'), (3, 'N'), (4, 'a'), (5, 'dog')]))
a_dog.add_edges_from([(1, 2), (1, 3), (2, 4), (3, 5)])
a_cat = nx.DiGraph()
a_cat.add_nodes_from(label_nodes([(1, 'NP'), (2, 'D'), (3, 'N'), (4, 'a'), (5, 'cat')]))
a_cat.add_edges_from([(1, 2), (1, 3), (2, 4), (3, 5)])
In [48]:
draw_multiple_graphs([a_dog, a_cat])
In [51]:
# draw_multiple_graphs(generate_all_unique_subtrees(a_dog, a_cat))
In [24]:
# print tree_kernel(tree1, tree2)
# print tree_kernel_recursive(tree1, tree2)
In [25]:
tree3 = nx.DiGraph()
tree3.add_edges_from([('D', 'a')])
# print tree_kernel(tree3, tree3)
# print tree_kernel_recursive(tree3, tree3)
In [26]:
[s.edges() for s in get_subtrees(tree1)]
Out[26]:
In [27]:
draw(tree_jeff_ate_cookies, show='ipynb')
Out[27]:
In [28]:
draw(tree_steve_ate_bananas, show='ipynb')
Out[28]:
In [29]:
draw(tree_alex_died, show='ipynb')
Out[29]:
In [30]:
draw(tree_the_man_drank_wine, show='ipynb')
Out[30]:
In [31]:
draw(tree_the_man_killed_the_woman, show='ipynb')
Out[31]:
caveat: it might produce subtrees we don't want
e.g. if tree contains both NP -> N
and NP -> D N
rules
the algorithm will produce an NP -> N -> Noun
subtree even from
a NP -> D N
subtree
In [32]:
get_production_rules(tree_the_man_drank_wine, node_attrib='label')
Out[32]:
In [33]:
len(list(get_subtrees(tree_the_man_drank_wine, node_attrib='label')))
Out[33]:
In [34]:
len(list(get_subtrees(tree_the_man_drank_wine)))
Out[34]:
In [35]:
print len(find_all_common_subtrees_bruteforce(tree_jeff_ate_cookies, tree_jeff_ate_cookies))
print len(find_all_common_subtrees_bruteforce(tree_jeff_ate_cookies, tree_jeff_ate_cookies, node_attrib='label'))
In [36]:
print len(find_all_common_subtrees_bruteforce(tree_jeff_ate_cookies, tree_steve_ate_bananas))
print len(find_all_common_subtrees_bruteforce(tree_jeff_ate_cookies, tree_steve_ate_bananas, node_attrib='label'))
In [37]:
print len(find_all_common_subtrees_bruteforce(tree_jeff_ate_cookies, tree_the_man_drank_wine))
print len(find_all_common_subtrees_bruteforce(tree_jeff_ate_cookies, tree_the_man_drank_wine, node_attrib='label'))