String subsequence kernel

Source: http://metaoptimize.com/qa/questions/4262/string-kernel-implementation
Author: Rok Mocnik



In [2]:

    
def SSK(lamb, p):
        """Return subsequence kernel"""
        cache = {} # added to make code work // AN
        def SSKernel(xi,xj,lamb,p):
            """
            It's not commented but, xi and xj are strings,
            lamb and p are parameters described in the paper.
            
            It also caches already seen pairs of xi and xj.
            """
            mykey = (xi, xj) if xi>xj else (xj, xi)
            if not mykey in cache:
                dps = []
                for i in xrange(len(xi)):
                    dps.append([lamb**2 if xi[i] == xj[j] else 0 for j in xrange(len(xj))])
                dp = []
                for i in xrange(len(xi)+1):
                    dp.append([0]*(len(xj)+1))
                k = [0]*(p+1)
                for l in xrange(2, p + 1):
                    for i in xrange(len(xi)):
                        for j in xrange(len(xj)):
                            dp[i+1][j+1] = dps[i][j] + lamb * dp[i][j+1] + lamb * dp[i+1][j] - lamb**2 * dp[i][j]
                            if xi[i] == xj[j]:
                                dps[i][j] = lamb**2 * dp[i][j]
                                k[l] = k[l] + dps[i][j]
                cache[mykey] = k[p]
            return cache[mykey]
        return lambda xi, xj: SSKernel(xi,xj,lamb,p)/(SSKernel(xi,xi,lamb,p) * SSKernel(xj,xj,lamb,p))**0.5



In [80]:

    
def SSKernel(xi,xj,lamb,p, cache=None):
    """
    decay factor λ ∈ (0, 1)
    
    It's not commented but, xi and xj are strings,
    lamb and p are parameters described in the paper.

    It also caches already seen pairs of xi and xj.
    """
    if not cache:
        cache = {}
    mykey = (xi, xj) if xi>xj else (xj, xi)
    if not mykey in cache:
        dps = []
        for i in xrange(len(xi)):
            dps_line = [lamb**2 if xi[i] == xj[j] else 0 for j in xrange(len(xj))]
            dps.append(dps_line)
        dp = []
        for i in xrange(len(xi)+1):
            dp.append([0]*(len(xj)+1))
        k = [0]*(p+1)
        for l in xrange(2, p + 1):
            for i in xrange(len(xi)):
                for j in xrange(len(xj)):
                    dp[i+1][j+1] = dps[i][j] + lamb * dp[i][j+1] + lamb * dp[i+1][j] - lamb**2 * dp[i][j]
                    if xi[i] == xj[j]:
                        dps[i][j] = lamb**2 * dp[i][j]
                        k[l] = k[l] + dps[i][j]
        cache[mykey] = k[p]
    return cache[mykey], dp, dps



In [5]:

    
kant1, kant2 = "science is organized knowledge", "wisdom is organized life"



In [87]:

    
def assk(s, t, lamb, p, cache=None):
    """
    decay factor λ ∈ (0, 1)
    
    It's not commented but, s and t are strings,
    lamb and p are parameters described in the paper.

    It also caches already seen pairs of s and xj.
    """
    if not cache:
        cache = {}
    mykey = (s, t) if s>t else (t, s)

    if not mykey in cache:
        dps = numpy.zeros( (len(s), len(t)) )
        for i, s_i in enumerate(s):
            for j, t_j in enumerate(t):
                if s_i == t_j:
                    dps[i][j] = lamb**2

        dp = numpy.zeros( (len(s)+1, len(t)+1) )
        k = numpy.zeros(p+1)

        for l in xrange(2, p+1):
            for i, s_i in enumerate(s):
                for j, t_j in enumerate(t):
                    dp[i+1][j+1] = dps[i][j] + lamb * dp[i][j+1] + lamb * dp[i+1][j] - lamb**2 * dp[i][j]
                    if s_i == t_j:
                        dps[i][j] = lamb**2 * dp[i][j]
                        k[l] = k[l] + dps[i][j]
        
        cache[mykey] = k[p]                 
    return cache[mykey], dp, dps



In [84]:

    
from repoze.lru import lru_cache

# @lru_cache(500)
def all_subsequences_kernel(s, t, tab=0, calltype='first'):
    """
    Shawe-Taylor and Cristianini (2004, p. 353f)
    """
    print '{}{} - ask({}, {})'.format('\t'*tab, calltype, s, t)
    
    # if s or t are empty strings
    if not s or not t:
        print "{}return: 1 ({})".format('\t'*tab, calltype)
        return 1  # each string contains the empty string by definition
    else:
        s_head, s_tail = s[:-1], s[-1]
        print '{}head: {}, tail: {}'.format('\t'*tab, s_head, s_tail)
        result = 0
        for k, tk in enumerate(t):
            print '{}k: {}, t[k]: {}'.format('\t'*(tab+1), k, tk)
            if t[k] == s_tail:
                result += all_subsequences_kernel(s_head, t[:k], tab+1, calltype='recursive')
    return_val = all_subsequences_kernel(s_head, t, tab, calltype='last') + result
    print "{}return: {} ({})".format('\t'*tab, return_val, calltype)
    return return_val



In [87]:

    
all_subsequences_kernel('gatt', 'cat')









    



first - ask(gatt, cat)
head: gat, tail: t
	k: 0, t[k]: c
	k: 1, t[k]: a
	k: 2, t[k]: t
	recursive - ask(gat, ca)
	head: ga, tail: t
		k: 0, t[k]: c
		k: 1, t[k]: a
	last - ask(ga, ca)
	head: g, tail: a
		k: 0, t[k]: c
		k: 1, t[k]: a
		recursive - ask(g, c)
		head: , tail: g
			k: 0, t[k]: c
		last - ask(, c)
		return: 1 (last)
		return: 1 (recursive)
	last - ask(g, ca)
	head: , tail: g
		k: 0, t[k]: c
		k: 1, t[k]: a
	last - ask(, ca)
	return: 1 (last)
	return: 1 (last)
	return: 2 (last)
	return: 2 (recursive)
last - ask(gat, cat)
head: ga, tail: t
	k: 0, t[k]: c
	k: 1, t[k]: a
	k: 2, t[k]: t
	recursive - ask(ga, ca)
	head: g, tail: a
		k: 0, t[k]: c
		k: 1, t[k]: a
		recursive - ask(g, c)
		head: , tail: g
			k: 0, t[k]: c
		last - ask(, c)
		return: 1 (last)
		return: 1 (recursive)
	last - ask(g, ca)
	head: , tail: g
		k: 0, t[k]: c
		k: 1, t[k]: a
	last - ask(, ca)
	return: 1 (last)
	return: 1 (last)
	return: 2 (recursive)
last - ask(ga, cat)
head: g, tail: a
	k: 0, t[k]: c
	k: 1, t[k]: a
	recursive - ask(g, c)
	head: , tail: g
		k: 0, t[k]: c
	last - ask(, c)
	return: 1 (last)
	return: 1 (recursive)
	k: 2, t[k]: t
last - ask(g, cat)
head: , tail: g
	k: 0, t[k]: c
	k: 1, t[k]: a
	k: 2, t[k]: t
last - ask(, cat)
return: 1 (last)
return: 1 (last)
return: 2 (last)
return: 4 (last)
return: 6 (first)






    Out[87]:





6



In [30]:

    
t = 'abcde'
for k, t_i in enumerate(t, 1):
    print k, t_i
    print "\t", t[:k-1]









    



1 a
	
2 b
	a
3 c
	ab
4 d
	abc
5 e
	abcd



In [32]:

    
t = 'abcde'
for k, t_i in enumerate(t):
    print k, t_i
    print "\t", t[:k]









    



0 a
	
1 b
	a
2 c
	ab
3 d
	abc
4 e
	abcd