In [1]:
def z(s):
''' Use Z-algorithm to preprocess given string. See
Gusfield for complete description of algorithm. '''
Z = [len(s)] + [0] * len(s)
assert len(s) > 1
# Initial comparison of s[1:] with prefix
for i in range(1, len(s)):
if s[i] == s[i-1]:
Z[1] += 1
else:
break
r, l = 0, 0
if Z[1] > 0:
r, l = Z[1], 1
for k in range(2, len(s)):
assert Z[k] == 0
if k > r:
# Case 1
for i in range(k, len(s)):
if s[i] == s[i-k]:
Z[k] += 1
else:
break
r, l = k + Z[k] - 1, k
else:
# Case 2
# Calculate length of beta
nbeta = r - k + 1
Zkp = Z[k - l]
if nbeta > Zkp:
# Case 2a: Zkp wins
Z[k] = Zkp
else:
# Case 2b: Compare characters just past r
nmatch = 0
for i in range(r+1, len(s)):
if s[i] == s[i - k]:
nmatch += 1
else:
break
l, r = k, r + nmatch
Z[k] = r - k + 1
return Z
In [2]:
z('abracadabra')
# abracadabra (11)
# a (1)
# a (1)
# abra (4)
# a (1)
Out[2]:
In [3]:
z('aaaaa')
Out[3]:
In [4]:
def zMatch(p, t):
s = p + "$" + t
Z = z(s)
occurrences = []
for i in range(len(p) + 1, len(s)):
if Z[i] >= len(p):
occurrences.append(i - (len(p) + 1))
return occurrences
In [5]:
zMatch('needle', 'haystack needle haystack')
# 012345678901234567890123
# 1 2
Out[5]:
In [6]:
zMatch('needle', 'haystack needle needle')
# 0123456789012345678901
# 1 2
Out[6]: