Day 12 - Z algorithm (string matching)


Definition(s)

Z algorithm is used for computing Zi(P) = the length of the longest substring of P that starts at i > 1 and matches a prefix of P.

Algorithm(s)


In [1]:
def z_algorithm(string):
    # z[0] = 0 (by convention)
    # z[i] = the longest prefix of string[0:n) and string[i:n) for 1 <= i < n
    # Complexity: O(n)
    
    l, r, n = 0, 0, len(string)
    z = [0 for i in range(n)]

    for i in range(1, n):
        if i <= r:
            z[i] = min(r - i + 1, z[i - l])

        while i + z[i] < n and string[z[i]] == string[i + z[i]]:
            z[i] += 1

        if i + z[i] - 1 > r:
            l = i
            r = i + z[i] - 1

    
    return z

In [2]:
def string_matching(pattern, text):
    p, t = len(pattern), len(text)
    z = z_algorithm(pattern + '#' + text)

    matches = []
    for i in range(p + 1, p + t + 1):
        if z[i] == p:
            matches.append(i - p - 1)

    return matches

Run(s)


In [3]:
print(string_matching('ABA', 'CABBCABABAB'))


[5, 7]

In [4]:
print(z_algorithm('abababababababababbabababa'))


[0, 0, 16, 0, 14, 0, 12, 0, 10, 0, 8, 0, 6, 0, 4, 0, 2, 0, 0, 7, 0, 5, 0, 3, 0, 1]

In [6]:
print(z_algorithm('ababa'))


[0, 0, 3, 0, 1]

In [14]:
with open('raven.txt', 'r') as file:
    raven = file.read()

In [15]:
print(string_matching('the', raven))


[160, 371, 444, 472, 562, 584, 617, 676, 821, 1060, 1140, 1317, 1338, 1410, 1504, 1534, 1566, 1580, 1597, 1671, 1732, 1919, 1930, 2033, 2080, 2133, 2166, 2196, 2481, 2512, 2658, 2713, 2749, 3036, 3116, 3145, 3248, 3253, 3280, 3285, 3342, 3378, 3444, 3485, 3727, 3806, 3945, 4213, 4336, 4369, 4434, 4514, 4612, 4668, 4678, 4703, 4735, 4799, 4840, 4940, 4965, 4985, 5126, 5137, 5198, 5402, 5459, 5523, 5555, 5660, 5675, 5691, 5823, 5923, 5952, 6017, 6093, 6140, 6195, 6261]

In [16]:
print(string_matching('our', raven))


[1109, 2822, 3232, 5594]