Runtime measurements for some string matching algorithms


In [1]:
import sys
import os.path
import time    # used for measurements
from matplotlib import pyplot as plt
from matplotlib import cm
import numpy as np
from random import sample

root = '/home/andreas/personal/dev/python/stringologie-ss14'
sys.path.append(root)

from stringmatching import stringmatching as sm

algs = sm.algs

In [2]:
# render matplotlib plots directly into the notebook
%matplotlib inline

Time measurement for pattern search in million digits of π


In [3]:
text = ''
with open(os.path.join(root, 'resources/pi_1million.txt'), 'r') as f:
    text = f.read()
    
pattern = '9529081197139256765089770918338138516867176390702131969107199123500774303145958784697786363208486985'
len(pattern)
passes = 10

measurements = {}
for name in algs:
    alg = algs[name]
    min_len = 3
    measurements[name] = {}
    for r in range(8):
        pat_len = min_len + 2**r
        measurements[name][pat_len] = {
            'min': sys.maxsize,
            'max': 0,
            'avg': sys.maxsize,
        }
        t = []
        print('Measuring time for "{}" with pattern length {}'.format(name, pat_len))
        for _ in range(passes):
            start_time = time.perf_counter()
            sm.search(pattern[:pat_len], text, alg, True)
            end_time = time.perf_counter()
            duration = end_time - start_time
            t.append(duration)
        measurements[name][pat_len]['min'] = min(t)
        measurements[name][pat_len]['max'] = max(t)
        avg = sum(t)/len(t)
        measurements[name][pat_len]['avg'] = avg
        measurements[name][pat_len]['std'] = sum([ (_ - avg)**2 for _ in t ])


Measuring time for "naive" with pattern length 4
Measuring time for "naive" with pattern length 5
Measuring time for "naive" with pattern length 7
Measuring time for "naive" with pattern length 11
Measuring time for "naive" with pattern length 19
Measuring time for "naive" with pattern length 35
Measuring time for "naive" with pattern length 67
Measuring time for "naive" with pattern length 131
Measuring time for "last_occ" with pattern length 4
Measuring time for "last_occ" with pattern length 5
Measuring time for "last_occ" with pattern length 7
Measuring time for "last_occ" with pattern length 11
Measuring time for "last_occ" with pattern length 19
Measuring time for "last_occ" with pattern length 35
Measuring time for "last_occ" with pattern length 67
Measuring time for "last_occ" with pattern length 131
Measuring time for "boyer_moore_galil" with pattern length 4
Measuring time for "boyer_moore_galil" with pattern length 5
Measuring time for "boyer_moore_galil" with pattern length 7
Measuring time for "boyer_moore_galil" with pattern length 11
Measuring time for "boyer_moore_galil" with pattern length 19
Measuring time for "boyer_moore_galil" with pattern length 35
Measuring time for "boyer_moore_galil" with pattern length 67
Measuring time for "boyer_moore_galil" with pattern length 131
Measuring time for "boyer_moore" with pattern length 4
Measuring time for "boyer_moore" with pattern length 5
Measuring time for "boyer_moore" with pattern length 7
Measuring time for "boyer_moore" with pattern length 11
Measuring time for "boyer_moore" with pattern length 19
Measuring time for "boyer_moore" with pattern length 35
Measuring time for "boyer_moore" with pattern length 67
Measuring time for "boyer_moore" with pattern length 131
Measuring time for "knuth_morris_pratt" with pattern length 4
Measuring time for "knuth_morris_pratt" with pattern length 5
Measuring time for "knuth_morris_pratt" with pattern length 7
Measuring time for "knuth_morris_pratt" with pattern length 11
Measuring time for "knuth_morris_pratt" with pattern length 19
Measuring time for "knuth_morris_pratt" with pattern length 35
Measuring time for "knuth_morris_pratt" with pattern length 67
Measuring time for "knuth_morris_pratt" with pattern length 131

In [4]:
m = measurements
N = len(m['naive'])
ind = np.arange(N)
width = 1/len(m)
bars =  []
i = 0
bars = []
for alg in m:
    avg = []
    std = []
    pat_lengths = [ _ for _ in m[alg] ]
    pat_lengths.sort()
    for pat_len in pat_lengths:
        avg.append(m[alg][pat_len]['avg']*1000)
        std.append(m[alg][pat_len]['std']*1000)
    bars.append(plt.bar(ind+i*width, tuple(avg), width, yerr=tuple(std), color=cm.Set1(i/N, 1)))
    plt.ylabel('milliseconds')
    plt.xlabel('pattern length')
    plt.xticks(ind+width/2., pat_lengths )
    i+=1
plt.legend(bars, [ _ for _ in m ], fontsize='xx-small', fancybox=True, shadow=True)
plt.show()


Time measurement for pattern serach in a big text file containing natural language


In [5]:
text = ''
with open(os.path.join(root, 'resources/big.txt'), 'r') as f:
    text = f.read()

In [6]:
# prepare patterns
splits = sorted([ (len(x), x) for x in text.split() ])
patterns = {}
passes = 5
for length in range(4, 3**3+1, 3):
    patterns[length] = sample(set([ y for (_,y) in splits if _ == length ]), passes)

In [7]:
measurements = {}
for name in algs:
    alg = algs[name]
    min_len = 3
    measurements[name] = {}
    for pat_len in sorted( _ for _ in patterns ):
        measurements[name][pat_len] = {
            'min': sys.maxsize,
            'max': 0,
            'avg': sys.maxsize,
        }
        t = []
        print('Measuring time for "{}" with pattern length {}'.format(name, pat_len))
        # get some random patterns (all of them would use way too much time)
        for pattern in patterns[pat_len]:
            start_time = time.perf_counter()
            sm.search(pattern, text, alg, True)
            end_time = time.perf_counter()
            duration = end_time - start_time
            t.append(duration)
        measurements[name][pat_len]['min'] = min(t)
        measurements[name][pat_len]['max'] = max(t)
        avg = sum(t)/len(t)
        measurements[name][pat_len]['avg'] = avg
        measurements[name][pat_len]['std'] = sum([ (_ - avg)**2 for _ in t ])


Measuring time for "naive" with pattern length 4
Measuring time for "naive" with pattern length 7
Measuring time for "naive" with pattern length 10
Measuring time for "naive" with pattern length 13
Measuring time for "naive" with pattern length 16
Measuring time for "naive" with pattern length 19
Measuring time for "naive" with pattern length 22
Measuring time for "naive" with pattern length 25
Measuring time for "last_occ" with pattern length 4
Measuring time for "last_occ" with pattern length 7
Measuring time for "last_occ" with pattern length 10
Measuring time for "last_occ" with pattern length 13
Measuring time for "last_occ" with pattern length 16
Measuring time for "last_occ" with pattern length 19
Measuring time for "last_occ" with pattern length 22
Measuring time for "last_occ" with pattern length 25
Measuring time for "boyer_moore_galil" with pattern length 4
Measuring time for "boyer_moore_galil" with pattern length 7
Measuring time for "boyer_moore_galil" with pattern length 10
Measuring time for "boyer_moore_galil" with pattern length 13
Measuring time for "boyer_moore_galil" with pattern length 16
Measuring time for "boyer_moore_galil" with pattern length 19
Measuring time for "boyer_moore_galil" with pattern length 22
Measuring time for "boyer_moore_galil" with pattern length 25
Measuring time for "boyer_moore" with pattern length 4
Measuring time for "boyer_moore" with pattern length 7
Measuring time for "boyer_moore" with pattern length 10
Measuring time for "boyer_moore" with pattern length 13
Measuring time for "boyer_moore" with pattern length 16
Measuring time for "boyer_moore" with pattern length 19
Measuring time for "boyer_moore" with pattern length 22
Measuring time for "boyer_moore" with pattern length 25
Measuring time for "knuth_morris_pratt" with pattern length 4
Measuring time for "knuth_morris_pratt" with pattern length 7
Measuring time for "knuth_morris_pratt" with pattern length 10
Measuring time for "knuth_morris_pratt" with pattern length 13
Measuring time for "knuth_morris_pratt" with pattern length 16
Measuring time for "knuth_morris_pratt" with pattern length 19
Measuring time for "knuth_morris_pratt" with pattern length 22
Measuring time for "knuth_morris_pratt" with pattern length 25

In [8]:
m = measurements
N = len(m['naive'])
ind = np.arange(N)
width = 1/len(m)
bars =  []
i = 0
bars = []
for alg in m:
    avg = []
    std = []
    pat_lengths = [ _ for _ in m[alg] ]
    pat_lengths.sort()
    for pat_len in pat_lengths:
        avg.append(m[alg][pat_len]['avg']*1000)
        std.append(m[alg][pat_len]['std']*1000)
    bars.append(plt.bar(ind+i*width, tuple(avg), width, yerr=tuple(std), color=cm.Set1(i/N, 1)))
    plt.ylabel('milliseconds')
    plt.xlabel('pattern length')
    plt.xticks(ind+width/2., pat_lengths )
    i+=1
plt.legend(bars, [ _ for _ in m ], fontsize='xx-small', fancybox=True, shadow=True)
plt.show()



In [9]:
import cProfile
text = ''
with open(os.path.join(root, 'resources/pi_10million.txt'), 'r') as f:
    text = f.read()
    
pattern = '9529081197139256765089770918338138516867176390702131969107199123500774303145958784697786363208486985'


for name in sm.algs:
    cmd = "sm.search(pattern, text, sm.algs['{}'], True)".format(name)
    print(name)
    cProfile.run(cmd)


naive
         15 function calls in 5.416 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    5.416    5.416 <string>:1(<module>)
        2    5.415    2.708    5.415    2.708 naive.py:21(__naive)
        1    0.000    0.000    5.416    5.416 naive.py:4(search)
        1    0.000    0.000    5.416    5.416 stringmatching.py:17(search)
        1    0.000    0.000    5.416    5.416 {built-in method exec}
        7    0.000    0.000    0.000    0.000 {built-in method len}
        1    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


last_occ
         14 function calls in 1.501 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.501    1.501 <string>:1(<module>)
        2    1.501    0.750    1.501    0.750 last_occ.py:25(__last_occ)
        1    0.000    0.000    0.000    0.000 last_occ.py:42(__last_occurence)
        1    0.000    0.000    1.501    1.501 last_occ.py:5(search)
        1    0.000    0.000    1.501    1.501 stringmatching.py:17(search)
        1    0.000    0.000    1.501    1.501 {built-in method exec}
        5    0.000    0.000    0.000    0.000 {built-in method len}
        1    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


boyer_moore_galil
         1098397 function calls in 1.636 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.636    1.636 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 bm.py:21(_weak_bm_shift)
        1    0.000    0.000    0.000    0.000 bm.py:34(_last_occurence)
        1    0.000    0.000    0.000    0.000 bm.py:41(_border)
        1    0.000    0.000    0.000    0.000 bm.py:5(_suffix)
        1    0.000    0.000    1.636    1.636 bm.py:56(search)
        2    1.363    0.682    1.636    0.818 bm.py:77(__bmg)
        1    0.000    0.000    1.636    1.636 stringmatching.py:17(search)
        1    0.000    0.000    1.636    1.636 {built-in method exec}
       11    0.000    0.000    0.000    0.000 {built-in method len}
  1098275    0.273    0.000    0.273    0.000 {built-in method max}
       99    0.000    0.000    0.000    0.000 {built-in method min}
        1    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


boyer_moore
         939521 function calls in 1.311 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.311    1.311 <string>:1(<module>)
        1    0.000    0.000    1.311    1.311 bm.py:101(search)
        2    1.091    0.546    1.311    0.655 bm.py:121(__bm)
        1    0.000    0.000    0.000    0.000 bm.py:21(_weak_bm_shift)
        1    0.000    0.000    0.000    0.000 bm.py:34(_last_occurence)
        1    0.000    0.000    0.000    0.000 bm.py:41(_border)
        1    0.000    0.000    0.000    0.000 bm.py:5(_suffix)
        1    0.000    0.000    1.311    1.311 stringmatching.py:17(search)
        1    0.000    0.000    1.311    1.311 {built-in method exec}
       11    0.000    0.000    0.000    0.000 {built-in method len}
   939399    0.220    0.000    0.220    0.000 {built-in method max}
       99    0.000    0.000    0.000    0.000 {built-in method min}
        1    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


knuth_morris_pratt
         122 function calls in 3.594 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.594    3.594 <string>:1(<module>)
        2    3.593    1.797    3.593    1.797 kmp.py:26(__kmp)
        1    0.000    0.000    3.594    3.594 kmp.py:4(search)
        2    0.000    0.000    0.000    0.000 kmp.py:47(compute_prefix)
        1    0.000    0.000    0.000    0.000 kmp.py:83(__strong_border)
        1    0.000    0.000    3.594    3.594 stringmatching.py:17(search)
        1    0.000    0.000    3.594    3.594 {built-in method exec}
       11    0.000    0.000    0.000    0.000 {built-in method len}
      101    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


Search for periodic patterns in a periodic text


In [10]:
patterns = {}
passes = 5
abc = 'abcdefhijklmnopqrstuvwxyz'
for pat_len in range(3, len(abc), 3):
    patterns[pat_len] = abc[:pat_len]

measurements = {}
for name in algs:
    alg = algs[name]
    measurements[name] = {}
    for pat_len in sorted( _ for _ in patterns ):
        measurements[name][pat_len] = {
            'min': sys.maxsize,
            'max': 0,
            'avg': sys.maxsize,
        }
        t = []
        print('Measuring time for "{}" with pattern length {}'.format(name, pat_len))
        for pattern in patterns[pat_len]:
            text = pattern * 100000 + 'z'
            for _ in range(passes):
                start_time = time.perf_counter()
                sm.search(pattern, text, alg, True)
                end_time = time.perf_counter()
                duration = end_time - start_time
                t.append(duration)
        measurements[name][pat_len]['min'] = min(t)
        measurements[name][pat_len]['max'] = max(t)
        avg = sum(t)/len(t)
        measurements[name][pat_len]['avg'] = avg
        measurements[name][pat_len]['std'] = sum([ (_ - avg)**2 for _ in t ])


Measuring time for "naive" with pattern length 3
Measuring time for "naive" with pattern length 6
Measuring time for "naive" with pattern length 9
Measuring time for "naive" with pattern length 12
Measuring time for "naive" with pattern length 15
Measuring time for "naive" with pattern length 18
Measuring time for "naive" with pattern length 21
Measuring time for "naive" with pattern length 24
Measuring time for "last_occ" with pattern length 3
Measuring time for "last_occ" with pattern length 6
Measuring time for "last_occ" with pattern length 9
Measuring time for "last_occ" with pattern length 12
Measuring time for "last_occ" with pattern length 15
Measuring time for "last_occ" with pattern length 18
Measuring time for "last_occ" with pattern length 21
Measuring time for "last_occ" with pattern length 24
Measuring time for "boyer_moore_galil" with pattern length 3
Measuring time for "boyer_moore_galil" with pattern length 6
Measuring time for "boyer_moore_galil" with pattern length 9
Measuring time for "boyer_moore_galil" with pattern length 12
Measuring time for "boyer_moore_galil" with pattern length 15
Measuring time for "boyer_moore_galil" with pattern length 18
Measuring time for "boyer_moore_galil" with pattern length 21
Measuring time for "boyer_moore_galil" with pattern length 24
Measuring time for "boyer_moore" with pattern length 3
Measuring time for "boyer_moore" with pattern length 6
Measuring time for "boyer_moore" with pattern length 9
Measuring time for "boyer_moore" with pattern length 12
Measuring time for "boyer_moore" with pattern length 15
Measuring time for "boyer_moore" with pattern length 18
Measuring time for "boyer_moore" with pattern length 21
Measuring time for "boyer_moore" with pattern length 24
Measuring time for "knuth_morris_pratt" with pattern length 3
Measuring time for "knuth_morris_pratt" with pattern length 6
Measuring time for "knuth_morris_pratt" with pattern length 9
Measuring time for "knuth_morris_pratt" with pattern length 12
Measuring time for "knuth_morris_pratt" with pattern length 15
Measuring time for "knuth_morris_pratt" with pattern length 18
Measuring time for "knuth_morris_pratt" with pattern length 21
Measuring time for "knuth_morris_pratt" with pattern length 24

In [11]:
m = measurements
N = len(m['naive'])
ind = np.arange(N)
width = 1/len(m)
bars =  []
i = 0
bars = []
for alg in m:
    avg = []
    std = []
    pat_lengths = [ _ for _ in m[alg] ]
    pat_lengths.sort()
    for pat_len in pat_lengths:
        avg.append(m[alg][pat_len]['avg']*1000)
        std.append(m[alg][pat_len]['std']*1000)
    bars.append(plt.bar(ind+i*width, tuple(avg), width, yerr=tuple(std), color=cm.Set1(i/N, 1)))
    plt.ylabel('milliseconds')
    plt.xlabel('pattern length')
    plt.xticks(ind+width/2., pat_lengths )
    i+=1
plt.legend(bars, [ _ for _ in m ], fontsize='xx-small', fancybox=True, shadow=True)
plt.show()



In [11]: