In [2]:
def SSK(lamb, p):
"""Return subsequence kernel"""
cache = {} # added to make code work // AN
def SSKernel(xi,xj,lamb,p):
"""
It's not commented but, xi and xj are strings,
lamb and p are parameters described in the paper.
It also caches already seen pairs of xi and xj.
"""
mykey = (xi, xj) if xi>xj else (xj, xi)
if not mykey in cache:
dps = []
for i in xrange(len(xi)):
dps.append([lamb**2 if xi[i] == xj[j] else 0 for j in xrange(len(xj))])
dp = []
for i in xrange(len(xi)+1):
dp.append([0]*(len(xj)+1))
k = [0]*(p+1)
for l in xrange(2, p + 1):
for i in xrange(len(xi)):
for j in xrange(len(xj)):
dp[i+1][j+1] = dps[i][j] + lamb * dp[i][j+1] + lamb * dp[i+1][j] - lamb**2 * dp[i][j]
if xi[i] == xj[j]:
dps[i][j] = lamb**2 * dp[i][j]
k[l] = k[l] + dps[i][j]
cache[mykey] = k[p]
return cache[mykey]
return lambda xi, xj: SSKernel(xi,xj,lamb,p)/(SSKernel(xi,xi,lamb,p) * SSKernel(xj,xj,lamb,p))**0.5
In [80]:
def SSKernel(xi,xj,lamb,p, cache=None):
"""
decay factor λ ∈ (0, 1)
It's not commented but, xi and xj are strings,
lamb and p are parameters described in the paper.
It also caches already seen pairs of xi and xj.
"""
if not cache:
cache = {}
mykey = (xi, xj) if xi>xj else (xj, xi)
if not mykey in cache:
dps = []
for i in xrange(len(xi)):
dps_line = [lamb**2 if xi[i] == xj[j] else 0 for j in xrange(len(xj))]
dps.append(dps_line)
dp = []
for i in xrange(len(xi)+1):
dp.append([0]*(len(xj)+1))
k = [0]*(p+1)
for l in xrange(2, p + 1):
for i in xrange(len(xi)):
for j in xrange(len(xj)):
dp[i+1][j+1] = dps[i][j] + lamb * dp[i][j+1] + lamb * dp[i+1][j] - lamb**2 * dp[i][j]
if xi[i] == xj[j]:
dps[i][j] = lamb**2 * dp[i][j]
k[l] = k[l] + dps[i][j]
cache[mykey] = k[p]
return cache[mykey], dp, dps
In [5]:
kant1, kant2 = "science is organized knowledge", "wisdom is organized life"
In [87]:
def assk(s, t, lamb, p, cache=None):
"""
decay factor λ ∈ (0, 1)
It's not commented but, s and t are strings,
lamb and p are parameters described in the paper.
It also caches already seen pairs of s and xj.
"""
if not cache:
cache = {}
mykey = (s, t) if s>t else (t, s)
if not mykey in cache:
dps = numpy.zeros( (len(s), len(t)) )
for i, s_i in enumerate(s):
for j, t_j in enumerate(t):
if s_i == t_j:
dps[i][j] = lamb**2
dp = numpy.zeros( (len(s)+1, len(t)+1) )
k = numpy.zeros(p+1)
for l in xrange(2, p+1):
for i, s_i in enumerate(s):
for j, t_j in enumerate(t):
dp[i+1][j+1] = dps[i][j] + lamb * dp[i][j+1] + lamb * dp[i+1][j] - lamb**2 * dp[i][j]
if s_i == t_j:
dps[i][j] = lamb**2 * dp[i][j]
k[l] = k[l] + dps[i][j]
cache[mykey] = k[p]
return cache[mykey], dp, dps
In [84]:
from repoze.lru import lru_cache
# @lru_cache(500)
def all_subsequences_kernel(s, t, tab=0, calltype='first'):
"""
Shawe-Taylor and Cristianini (2004, p. 353f)
"""
print '{}{} - ask({}, {})'.format('\t'*tab, calltype, s, t)
# if s or t are empty strings
if not s or not t:
print "{}return: 1 ({})".format('\t'*tab, calltype)
return 1 # each string contains the empty string by definition
else:
s_head, s_tail = s[:-1], s[-1]
print '{}head: {}, tail: {}'.format('\t'*tab, s_head, s_tail)
result = 0
for k, tk in enumerate(t):
print '{}k: {}, t[k]: {}'.format('\t'*(tab+1), k, tk)
if t[k] == s_tail:
result += all_subsequences_kernel(s_head, t[:k], tab+1, calltype='recursive')
return_val = all_subsequences_kernel(s_head, t, tab, calltype='last') + result
print "{}return: {} ({})".format('\t'*tab, return_val, calltype)
return return_val
In [87]:
all_subsequences_kernel('gatt', 'cat')
Out[87]:
In [30]:
t = 'abcde'
for k, t_i in enumerate(t, 1):
print k, t_i
print "\t", t[:k-1]
In [32]:
t = 'abcde'
for k, t_i in enumerate(t):
print k, t_i
print "\t", t[:k]