In [27]:
def k_suffix_kernel(k, s, t):
    """
    calculates k-suffix kernel of input strings s and t::
    
        K_{k}^{S}(s,t)=
        \begin{cases}
        1 \text{ if } s=s_{1}u, t=t_{1}u, \text{ for } u \in \Sigma^{k} & \\
        0 \textrm{ otherwise}
        \end{cases}
        
        (Shawe-Taylor and Cristianini 2004, p.349)
    
    Parameters
    ----------
    k : int
        suffix length
    s : str
        input string 1
    t : str
        input string 2
    
    Returns
    -------
    product : int
        returns 1, iff the strings s and t have the same suffix (of length k).
        otherwise, returns 0.
    """
    assert min(len(s), len(t)) >= k, \
        "strings must be at least as long as the given suffix length k"
    s_suffix = s[-k:] # suffix of length k
    t_suffix = t[-k:]
    return 1 if s_suffix == t_suffix else 0

In [28]:
k_suffix_kernel(3, 'cat', 'cat')


Out[28]:
1

In [29]:
def p_spectrum_kernel(p, s, t):
    """
    calculates the inner product of the p-spectra of
    the strings s and t.
    
    The p-spectrum of a sequence is defined as the
    histogram of frequencies of all its (contiguous)
    substrings of length p::
    
        \sum\limits^{|s|-p+1}_{i=1} \sum\limits^{|t|-p+1}_{j=1} K^{S}_{p}(s(i:i+p), t(j:j+p))
    
        (Shawe-Taylor and Cristianini 2004, p.349)
    
    Paramters
    ---------
    p : int
        length of contiguous substrings to be found
    s : str
        input string 1
    t : str
        input string 2       
    """
    result = 0
    for i in xrange(len(s)-p+1):
        for j in xrange(len(s)-p+1):
            result += k_suffix_kernel(p, s[i:i+p], t[j:j+p])
    return result

In [30]:
p_spectrum_kernel(2, 'cat', 'cat')


Out[30]:
2

In [31]:
p_spectrum_kernel(3, 'statistics', 'computation')


Out[31]:
2

In [32]:
p_spectrum_kernel(1, 'cat', 'cat')


Out[32]:
3

In [33]:
p_spectrum_kernel(2, 'cat', 'cat')


Out[33]:
2

In [34]:
p_spectrum_kernel(3, 'cat', 'cat')


Out[34]:
1

In [35]:
p_spectrum_kernel(4, 'cat', 'cat')


Out[35]:
0

function [result] = blended_spectrum_bf(s,t,p) %BLENDED_SPECTRUM_BF % -Finds the contiguous subsequence match count between strings s and t % by using a brute-force approach, for all substrings of length <= p. % -This program does not take into account gap penalties. % *(There is also a dynamic programming implementation of this algorithm. % Type help blended_spectrum for info.) % % Kp = [Summation of h from 1 to p] % [Summation of i from 1 to |s|-p+1] % [Summation of j from 1 to |t|-p+1] % delta(s(i:i+p+1),t(j:j+p+1)), % % (where delta is the identity function, which returns % 1 if the arguments are equal, and 0 otherwise.) % % -Example: blended_spectrum_bf('abccc','abc', 2) returns a value of 7. % (Note that blended_spectrum_bf('abccc','abc',2)=blended_spectrum_bf('abc','abccc',2) since K(s,t,p) = K(t,s,p) ). % -Example: blended_spectrum_bf('a','a', 1) returns a value of 1. % -Example: blended_spectrum_bf('a','a', 2) returns a value of 4. % -Example: blended_spectrum_bf('a','b', 1) returns a value of 0. % -Example: blended_spectrum_bf('ab','ab', 1) returns a value of 2.

%------------------------------------------------------------------------------------------

%Obtain lengths of strings [num_rows_s, n] = size(s); [num_rows_t, m] = size(t);

%Initialize result variable result = 0;

%Error checking statements: %Make sure input vectors are horizontal. if (num_rows_s ~= 1 | num_rows_t ~= 1)
error('Error: s and t must be horizontal vectors.'); end;

%If p is less than zero or not a number, program should quit due to faulty variable input. if p <= 0 | ischar(p) error('Error: p needs to be a number greater than 0.'); end; %End of error checking

%Implement the algorithm with double 'for' loops. for h=1:p for i=1:(n-h+1) for j=1:(m-h+1)
result = result + strcmpi( s(i:(i+h-1)), t(j:(j+h-1)) ); end; end; end;


In [36]:
from functools32 import lru_cache

def bruteforce_blended_spectrum_kernel(s, t, p):
    """
    returns the number of contiguous subsequences/substrings
    between strings s and t for all substrings of length <= p.
    
    Note: this (bruteforce) version does not consider
    gap penalties (i.e. there's no lambda decay).
    
    Examples
    --------
    >>> bruteforce_blended_spectrum_kernel('a', 'a', 1)
    1
    >>> bruteforce_blended_spectrum_kernel('a', 'b', 1)
    0
    >>> bruteforce_blended_spectrum_kernel('ab', 'ab', 1)
    2
    >>> bruteforce_blended_spectrum_kernel('abccc', 'abc', 2)
    7
    """
    def delta(s, t):
        """identity function."""
        return 1 if s == t else 0
    
    result = 0
    for h in xrange(p):
        for i in xrange(len(s)-h):
            for j in xrange(len(t)-h):
                result += delta(s[i:i+h+1], t[j:j+h+1])
    return result

In [37]:
s = """
In mathematics, computer science, economics, and bioinformatics,
dynamic programming is a method for solving complex problems
by breaking them down into simpler subproblems."""

t = """
It is applicable to problems exhibiting the properties of
overlapping subproblems[1] and optimal substructure
(described below)."""

In [67]:
from functools32 import lru_cache

@lru_cache
def p_suffix_kernel(s, t, p, lambda_weight):
    """
    evalutates the similarity of of the suffixes of the given
    input strings s and t.
    """
#     print "p_suffix_kernel({}, {}, {})".format(s, t, p)
    if p == 0:
        return 0
    # if s and t share a suffix of length p
    if s[-p:] == t[-p:]:
        # evaluate p-suffix kernel recursively on the remaining front portin of the strings
        # (i.e. the 'head' in Prolog terminology or 'car' in Lisp)
        return lambda_weight**2 * (1 + p_suffix_kernel(s[:-p], t[:-p], p-1, lambda_weight))
    else:
        return 0

In [39]:
p_suffix_kernel('verscheuchen', 'verseuchen', 3, 1)


p_suffix_kernel(verscheuchen, verseuchen, 3)
p_suffix_kernel(verscheuc, verseuc, 2)
p_suffix_kernel(versche, verse, 1)
p_suffix_kernel(versch, vers, 0)
Out[39]:
3

In [62]:
def blended_spectrum_kernel(s, t, p, lambda_weight=1):
    """
    blended version of the p-spectrum kernel,
    
    which considers all spectra in the range
    1 <= d <= p and weights them according to lamda_weigth^d.
    The calculation works just like the p-spectrum kernel,
    but uses the p-suffix instead of the k-suffix kernel::
    
        \tilde{K_{p}} =
            \sum\limits^{|s|-p+1}_{i=1}
            \sum\limits^{|t|-p+1}_{j=1}
                \tilde{K}^{S}_{p}(s(i:i+p), t(j:j+p))
    
        (Shawe-Taylor and Cristianini 2004, p.350f)
    
    Paramters
    ---------
    s : str
        input string 1
    t : str
        input string 2       
    p : int
        length of contiguous substrings to be found
    lambda_weight : int
       weight common suffixes according to their length
    """
    result = 0
    for i in xrange(len(s)-p+1):
        for j in xrange(len(s)-p+1):
            result += p_suffix_kernel(s[i:i+p], t[j:j+p], p, lambda_weight)
    return result

In [44]:
print bruteforce_blended_spectrum_kernel('a', 'a', 1)
    print bruteforce_blended_spectrum_kernel('a', 'b', 1)
    print bruteforce_blended_spectrum_kernel('ab', 'ab', 1)
    print bruteforce_blended_spectrum_kernel('abccc', 'abc', 2)


1
0
2
7

In [69]:
blended_spectrum_kernel(s, t, 2)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-69-40d50f38a027> in <module>()
----> 1 blended_spectrum_kernel(s, t, 2)

<ipython-input-62-8b54a6830e04> in blended_spectrum_kernel(s, t, p, lambda_weight)
     29     for i in xrange(len(s)-p+1):
     30         for j in xrange(len(s)-p+1):
---> 31             result += p_suffix_kernel(s[i:i+p], t[j:j+p], p, lambda_weight)
     32     return result

TypeError: unsupported operand type(s) for +=: 'int' and 'function'