In [1]:
# Not a great way to build a suffix array, but we'll use it
# for the small examples here
def naiveBuildSA(t):
satups = sorted([(t[i:], i) for i in range(len(t))])
return list(map(lambda x: x[1], satups))
In [2]:
naiveBuildSA('abaaba$') # works on a simple example
Out[2]:
In [3]:
def binarySearchSA(t, sa, p):
assert t[-1] == '$' # t already has terminator
assert len(t) == len(sa) # sa is the suffix array for t
if len(t) == 1: return 1
l, r = 0, len(sa) # invariant: sa[l] < p < sa[r]
while True:
c = (l + r) // 2
# determine whether p < T[sa[c]:] by doing comparisons
# starting from left-hand sides of p and T[sa[c]:]
plt = True # assume p < T[sa[c]:] until proven otherwise
i = 0
while i < len(p) and sa[c]+i < len(t):
if p[i] < t[sa[c]+i]:
break # p < T[sa[c]:]
elif p[i] > t[sa[c]+i]:
plt = False
break # p > T[sa[c]:]
i += 1 # tied so far
if plt:
if c == l + 1: return c
r = c
else:
if c == r - 1: return r
l = c
In [4]:
t = 'abaaba$'
sa = naiveBuildSA(t)
binarySearchSA(t, sa, 'aba')
Out[4]:
In [5]:
binarySearchSA(t, sa, 'bb') # p is greater than all suffixes
Out[5]:
In [6]:
binarySearchSA(t, sa, 'aa')
Out[6]:
In [7]:
def suffixLcp(t, toff, p):
i = 0
while i < len(p) and i + toff < len(t):
if p[i] != t[i + toff]:
return i
i += 1
return i
In [8]:
suffixLcp('abaaba$', 0, 'aba')
Out[8]:
In [9]:
suffixLcp('abaaba$', 0, 'abab')
Out[9]:
In [10]:
suffixLcp('abaaba$', 0, 'abaabaaba')
Out[10]:
In [11]:
def binarySearchSA_lcp1(t, sa, p):
assert t[-1] == '$' # t already has terminator
assert len(t) == len(sa) # sa is the suffix array for t
if len(t) == 1: return 1
l, r = 0, len(sa) # invariant: sa[l] < p < sa[r]
lcp_lp, lcp_rp = 0, 0
while True:
c = (l + r) // 2
# determine whether p < T[sa[c]:] by doing comparisons
# starting from left-hand sides of p and T[sa[c]:]
plt = True # assume p < T[sa[c]:] until proven otherwise
i = min(lcp_lp, lcp_rp)
while i < len(p) and sa[c]+i < len(t):
if p[i] < t[sa[c]+i]:
break # p < T[sa[c]:]
elif p[i] > t[sa[c]+i]:
plt = False
break # p > T[sa[c]:]
i += 1 # tied so far
if plt:
if c == l + 1: return c
r = c
lcp_rp = i
else:
if c == r - 1: return r
l = c
lcp_lp = i
In [12]:
binarySearchSA_lcp1(t, sa, 'aba')
Out[12]:
In [13]:
binarySearchSA_lcp1(t, sa, 'bb')
Out[13]:
In [14]:
binarySearchSA_lcp1(t, sa, 'aa')
Out[14]: