https://www.toptal.com/machine-learning/structured-data-tree-kernels
In [1]:
s1 = "abba ist"
s2 = "abbas christlich oder abbationalist?"
In [2]:
def unique_substrings(string):
    """Collect every distinct, non-empty substring of ``string``.

    The result is a set; the empty string is never a member.
    """
    found = set()
    total = len(string)
    for start in range(total):
        # `stop` is exclusive, so it runs one past the last index.
        for stop in range(start + 1, total + 1):
            found.add(string[start:stop])
    return found
In [3]:
from collections import Counter
def substring_counts(string):
    """Count how often each non-empty substring occurs in ``string``.

    Occurrences at different start positions are counted separately, so
    overlapping matches all contribute. Returns a ``Counter`` keyed by
    substring.
    """
    length = len(string)
    return Counter(
        string[start:stop]
        for start in range(length)
        for stop in range(start + 1, length + 1)
    )
In [4]:
def all_common_substrings(s1, s2):
    """Return the set of all non-empty substrings (incl. overlapping
    substrings) that occur in both input strings.

    Only the substrings of the shorter input are enumerated, since a
    common substring cannot be longer than the shorter string.
    """
    if len(s1) <= len(s2):
        shorter, longer = s1, s2
    else:
        shorter, longer = s2, s1
    # Every candidate is already a slice of `shorter`, so only membership
    # in `longer` still has to be checked.
    return {sub for sub in unique_substrings(shorter) if sub in longer}
In [5]:
def common_substrings_with_counts(s1, s2):
    """Return three things:

    - a set of all substrings occurring in both input strings
    - a mapping from substrings of s1 to the number of their occurrences
    - a mapping from substrings of s2 to the number of their occurrences

    Both mappings are whatever ``substring_counts`` returns for the
    respective string.
    """
    s1_counts = substring_counts(s1)
    s2_counts = substring_counts(s2)
    # intersection() accepts any iterable, so the keys of the second
    # mapping do not need to be copied into a temporary set first.
    common_subs = set(s1_counts).intersection(s2_counts)
    return common_subs, s1_counts, s2_counts
In [6]:
common_subs, s1_counts, s2_counts = common_substrings_with_counts(s1, s2)
In [7]:
def string_kernel(s1, s2):
    """Compute a substring-based similarity score for two strings.

    For every substring that occurs in both ``s1`` and ``s2``, multiply
    its occurrence count in ``s1`` by its occurrence count in ``s2``;
    the result is the sum of these products (0 if nothing is shared).
    """
    shared, counts_a, counts_b = common_substrings_with_counts(s1, s2)
    total = 0
    for sub in shared:
        total += counts_a[sub] * counts_b[sub]
    return total
In [8]:
from nltk.tree import ParentedTree
In [9]:
ptree = ParentedTree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))')
In [10]:
print(ptree)
(VP (VERB saw) (NP (DET the) (NOUN dog)))
In [11]:
for n in ptree:
print(type(n), n)
(<class 'nltk.tree.ParentedTree'>, ParentedTree('VERB', ['saw']))
(<class 'nltk.tree.ParentedTree'>, ParentedTree('NP', [ParentedTree('DET', ['the']), ParentedTree('NOUN', ['dog'])]))
In [12]:
ptree.label()
Out[12]:
'VP'
In [13]:
ptree.treeposition()
Out[13]:
()
In [14]:
# Walk the tree and show every subtree, starting with the tree itself.
# print(...) with a single argument behaves the same under Python 2 and 3
# and matches the call style used in the earlier cells.
for st in ptree.subtrees():
    print(st)
(VP (VERB saw) (NP (DET the) (NOUN dog)))
(VERB saw)
(NP (DET the) (NOUN dog))
(DET the)
(NOUN dog)
In [15]:
# The first child of the first subtree is a leaf; show its type.
# Call-style print keeps the cell valid under both Python 2 and 3.
print(type(ptree[0][0]))
<type 'str'>
In [16]:
ptree.leaves()
Out[16]:
['saw', 'the', 'dog']
In [17]:
import discoursegraphs as dg
import os
%load_ext gvmagic
dis_file = os.path.join(dg.DATA_ROOT_DIR, 'rst-example1.dis')
ddg = dg.read_dis(dis_file)
In [18]:
%dotstr dg.print_dot(ddg)
In [19]:
ddg.dis_tree
Out[19]:
In [20]:
import nltk
In [21]:
nltk.__version__
Out[21]:
'3.2.5'
In [ ]:
In [22]:
ddg.dis_tree._repr_png_()
Out[22]:
u''
In [23]:
%dotstr dg.print_dot(dg.read_dis("/home/arne/corpora/rst_discourse_treebank/data/RSTtrees-WSJ-double-1.0/wsj_1114.out.dis", tokenize=False))
In [24]:
# NOTE(review): known failure -- with discoursegraphs 0.3.2 this call raises
# NotImplementedError ("I don't know how to combine two satellites") inside
# parse_dis_tree; the traceback is kept below. The input is a hard-coded
# absolute path, so the cell will not run unchanged on another machine.
%dotstr dg.print_dot(dg.read_dis("/home/arne/corpora/rst_discourse_treebank/data/RSTtrees-WSJ-double-1.0/wsj_0633.out.dis", tokenize=False))
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-24-68dda545ec1e> in <module>()
----> 1 get_ipython().magic(u'dotstr dg.print_dot(dg.read_dis("/home/arne/corpora/rst_discourse_treebank/data/RSTtrees-WSJ-double-1.0/wsj_0633.out.dis", tokenize=False))')
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
2156 magic_name, _, magic_arg_s = arg_s.partition(' ')
2157 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2158 return self.run_line_magic(magic_name, magic_arg_s)
2159
2160 #-------------------------------------------------------------------------
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
2077 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2078 with self.builtin_trap:
-> 2079 result = fn(*args,**kwargs)
2080 return result
2081
<decorator-gen-357> in dotstr(self, line)
/usr/local/lib/python2.7/dist-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
/usr/local/lib/python2.7/dist-packages/gvmagic.pyc in dotstr(self, line)
51 @line_magic
52 def dotstr(self, line):
---> 53 self._from_str(line, 'dot')
54
55 @line_magic
/usr/local/lib/python2.7/dist-packages/gvmagic.pyc in _from_str(self, line, layout_engine)
151
152 def _from_str(self, line, layout_engine):
--> 153 s = self.shell.ev(line)
154 data = run_graphviz(s, layout_engine)
155 if data:
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in ev(self, expr)
2427 """
2428 with self.builtin_trap:
-> 2429 return eval(expr, self.user_global_ns, self.user_ns)
2430
2431 def safe_execfile(self, fname, *where, **kw):
<string> in <module>()
/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/dis.pyc in __init__(self, dis_filepath, name, namespace, tokenize, precedence)
76 self.tokens = []
77 self.dis_tree = self.disfile2tree(dis_filepath)
---> 78 self.parse_dis_tree(self.dis_tree)
79
80 if precedence:
/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/dis.pyc in parse_dis_tree(self, dis_tree, indent)
110 edge_type=EdgeTypes.dominance_relation)
111
--> 112 self.parse_dis_tree(child, indent=indent+1)
113
114 else: # tree_type in ('Nucleus', 'Satellite')
/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/dis.pyc in parse_dis_tree(self, dis_tree, indent)
175
176 for child in children:
--> 177 self.parse_dis_tree(child, indent=indent+1)
178
179 def get_child_types(self, children):
/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/readwrite/rst/dis.pyc in parse_dis_tree(self, dis_tree, indent)
157 else:
158 assert tree_type == 'Satellite'
--> 159 raise NotImplementedError("I don't know how to combine two satellites")
160
161 elif len(child_types['Satellite']) == 1 and len(child_types['Nucleus']) == 1:
NotImplementedError: I don't know how to combine two satellites
In [ ]:
Content source: arne-cl/alt-mulig
Similar notebooks: