In [27]:
def k_suffix_kernel(k, s, t):
"""
calculates k-suffix kernel of input strings s and t::
K_{k}^{S}(s,t)=
\begin{cases}
1 \text{ if } s=s_{1}u, t=t_{1}u, \text{ for } u \in \Sigma^{k} & \\
0 \textrm{ otherwise}
\end{cases}
(Shawe-Taylor and Cristianini 2004, p.349)
Parameters
----------
k : int
suffix length
s : str
input string 1
t : str
input string 2
Returns
-------
product : int
returns 1, iff the strings s and t have the same suffix (of length k).
otherwise, returns 0.
"""
assert min(len(s), len(t)) >= k, \
"strings must be at least as long as the given suffix length k"
s_suffix = s[-k:] # suffix of length k
t_suffix = t[-k:]
return 1 if s_suffix == t_suffix else 0
In [28]:
k_suffix_kernel(3, 'cat', 'cat')
Out[28]:
In [29]:
def p_spectrum_kernel(p, s, t):
"""
calculates the inner product of the p-spectra of
the strings s and t.
The p-spectrum of a sequence is defined as the
histogram of frequencies of all its (contiguous)
substrings of length p::
\sum\limits^{|s|-p+1}_{i=1} \sum\limits^{|t|-p+1}_{j=1} K^{S}_{p}(s(i:i+p), t(j:j+p))
(Shawe-Taylor and Cristianini 2004, p.349)
Paramters
---------
p : int
length of contiguous substrings to be found
s : str
input string 1
t : str
input string 2
"""
result = 0
for i in xrange(len(s)-p+1):
for j in xrange(len(s)-p+1):
result += k_suffix_kernel(p, s[i:i+p], t[j:j+p])
return result
In [30]:
p_spectrum_kernel(2, 'cat', 'cat')
Out[30]:
In [31]:
p_spectrum_kernel(3, 'statistics', 'computation')
Out[31]:
In [32]:
p_spectrum_kernel(1, 'cat', 'cat')
Out[32]:
In [33]:
p_spectrum_kernel(2, 'cat', 'cat')
Out[33]:
In [34]:
p_spectrum_kernel(3, 'cat', 'cat')
Out[34]:
In [35]:
p_spectrum_kernel(4, 'cat', 'cat')
Out[35]:
function [result] = blended_spectrum_bf(s,t,p) %BLENDED_SPECTRUM_BF % -Finds the contiguous subsequence match count between strings s and t % by using a brute-force approach, for all substrings of length <= p. % -This program does not take into account gap penalties. % *(There is also a dynamic programming implementation of this algorithm. % Type help blended_spectrum for info.) % % Kp = [Summation of h from 1 to p] % [Summation of i from 1 to |s|-p+1] % [Summation of j from 1 to |t|-p+1] % delta(s(i:i+p+1),t(j:j+p+1)), % % (where delta is the identity function, which returns % 1 if the arguments are equal, and 0 otherwise.) % % -Example: blended_spectrum_bf('abccc','abc', 2) returns a value of 7. % (Note that blended_spectrum_bf('abccc','abc',2)=blended_spectrum_bf('abc','abccc',2) since K(s,t,p) = K(t,s,p) ). % -Example: blended_spectrum_bf('a','a', 1) returns a value of 1. % -Example: blended_spectrum_bf('a','a', 2) returns a value of 4. % -Example: blended_spectrum_bf('a','b', 1) returns a value of 0. % -Example: blended_spectrum_bf('ab','ab', 1) returns a value of 2.
%------------------------------------------------------------------------------------------
%Obtain lengths of strings [num_rows_s, n] = size(s); [num_rows_t, m] = size(t);
%Initialize result variable result = 0;
%Error checking statements:
%Make sure input vectors are horizontal.
if (num_rows_s ~= 1 | num_rows_t ~= 1)
error('Error: s and t must be horizontal vectors.');
end;
%If p is less than zero or not a number, program should quit due to faulty variable input. if p <= 0 | ischar(p) error('Error: p needs to be a number greater than 0.'); end; %End of error checking
%Implement the algorithm with double 'for' loops.
for h=1:p
for i=1:(n-h+1)
for j=1:(m-h+1)
result = result + strcmpi( s(i:(i+h-1)), t(j:(j+h-1)) );
end;
end;
end;
In [36]:
from functools32 import lru_cache
def bruteforce_blended_spectrum_kernel(s, t, p):
"""
returns the number of contiguous subsequences/substrings
between strings s and t for all substrings of length <= p.
Note: this (bruteforce) version does not consider
gap penalties (i.e. there's no lambda decay).
Examples
--------
>>> bruteforce_blended_spectrum_kernel('a', 'a', 1)
1
>>> bruteforce_blended_spectrum_kernel('a', 'b', 1)
0
>>> bruteforce_blended_spectrum_kernel('ab', 'ab', 1)
2
>>> bruteforce_blended_spectrum_kernel('abccc', 'abc', 2)
7
"""
def delta(s, t):
"""identity function."""
return 1 if s == t else 0
result = 0
for h in xrange(p):
for i in xrange(len(s)-h):
for j in xrange(len(t)-h):
result += delta(s[i:i+h+1], t[j:j+h+1])
return result
In [37]:
s = """
In mathematics, computer science, economics, and bioinformatics,
dynamic programming is a method for solving complex problems
by breaking them down into simpler subproblems."""
t = """
It is applicable to problems exhibiting the properties of
overlapping subproblems[1] and optimal substructure
(described below)."""
In [67]:
from functools32 import lru_cache
@lru_cache
def p_suffix_kernel(s, t, p, lambda_weight):
"""
evalutates the similarity of of the suffixes of the given
input strings s and t.
"""
# print "p_suffix_kernel({}, {}, {})".format(s, t, p)
if p == 0:
return 0
# if s and t share a suffix of length p
if s[-p:] == t[-p:]:
# evaluate p-suffix kernel recursively on the remaining front portin of the strings
# (i.e. the 'head' in Prolog terminology or 'car' in Lisp)
return lambda_weight**2 * (1 + p_suffix_kernel(s[:-p], t[:-p], p-1, lambda_weight))
else:
return 0
In [39]:
p_suffix_kernel('verscheuchen', 'verseuchen', 3, 1)
Out[39]:
In [62]:
def blended_spectrum_kernel(s, t, p, lambda_weight=1):
"""
blended version of the p-spectrum kernel,
which considers all spectra in the range
1 <= d <= p and weights them according to lamda_weigth^d.
The calculation works just like the p-spectrum kernel,
but uses the p-suffix instead of the k-suffix kernel::
\tilde{K_{p}} =
\sum\limits^{|s|-p+1}_{i=1}
\sum\limits^{|t|-p+1}_{j=1}
\tilde{K}^{S}_{p}(s(i:i+p), t(j:j+p))
(Shawe-Taylor and Cristianini 2004, p.350f)
Paramters
---------
s : str
input string 1
t : str
input string 2
p : int
length of contiguous substrings to be found
lambda_weight : int
weight common suffixes according to their length
"""
result = 0
for i in xrange(len(s)-p+1):
for j in xrange(len(s)-p+1):
result += p_suffix_kernel(s[i:i+p], t[j:j+p], p, lambda_weight)
return result
In [44]:
print bruteforce_blended_spectrum_kernel('a', 'a', 1)
print bruteforce_blended_spectrum_kernel('a', 'b', 1)
print bruteforce_blended_spectrum_kernel('ab', 'ab', 1)
print bruteforce_blended_spectrum_kernel('abccc', 'abc', 2)
In [69]:
blended_spectrum_kernel(s, t, 2)