In [1]:
import sys
import os.path
import time # used for measurements
from matplotlib import pyplot as plt
from matplotlib import cm
import numpy as np
from random import sample
root = '/home/andreas/personal/dev/python/stringologie-ss14'
sys.path.append(root)
from stringmatching import stringmatching as sm
algs = sm.algs
In [2]:
# render matplotlib plots directly into the notebook
%matplotlib inline
In [3]:
text = ''
with open(os.path.join(root, 'resources/pi_1million.txt'), 'r') as f:
text = f.read()
pattern = '9529081197139256765089770918338138516867176390702131969107199123500774303145958784697786363208486985'
len(pattern)
passes = 10
measurements = {}
for name in algs:
alg = algs[name]
min_len = 3
measurements[name] = {}
for r in range(8):
pat_len = min_len + 2**r
measurements[name][pat_len] = {
'min': sys.maxsize,
'max': 0,
'avg': sys.maxsize,
}
t = []
print('Measuring time for "{}" with pattern length {}'.format(name, pat_len))
for _ in range(passes):
start_time = time.perf_counter()
sm.search(pattern[:pat_len], text, alg, True)
end_time = time.perf_counter()
duration = end_time - start_time
t.append(duration)
measurements[name][pat_len]['min'] = min(t)
measurements[name][pat_len]['max'] = max(t)
avg = sum(t)/len(t)
measurements[name][pat_len]['avg'] = avg
measurements[name][pat_len]['std'] = sum([ (_ - avg)**2 for _ in t ])
In [4]:
m = measurements
N = len(m['naive'])
ind = np.arange(N)
width = 1/len(m)
bars = []
i = 0
bars = []
for alg in m:
avg = []
std = []
pat_lengths = [ _ for _ in m[alg] ]
pat_lengths.sort()
for pat_len in pat_lengths:
avg.append(m[alg][pat_len]['avg']*1000)
std.append(m[alg][pat_len]['std']*1000)
bars.append(plt.bar(ind+i*width, tuple(avg), width, yerr=tuple(std), color=cm.Set1(i/N, 1)))
plt.ylabel('milliseconds')
plt.xlabel('pattern length')
plt.xticks(ind+width/2., pat_lengths )
i+=1
plt.legend(bars, [ _ for _ in m ], fontsize='xx-small', fancybox=True, shadow=True)
plt.show()
In [5]:
text = ''
with open(os.path.join(root, 'resources/big.txt'), 'r') as f:
text = f.read()
In [6]:
# prepare patterns
splits = sorted([ (len(x), x) for x in text.split() ])
patterns = {}
passes = 5
for length in range(4, 3**3+1, 3):
patterns[length] = sample(set([ y for (_,y) in splits if _ == length ]), passes)
In [7]:
measurements = {}
for name in algs:
alg = algs[name]
min_len = 3
measurements[name] = {}
for pat_len in sorted( _ for _ in patterns ):
measurements[name][pat_len] = {
'min': sys.maxsize,
'max': 0,
'avg': sys.maxsize,
}
t = []
print('Measuring time for "{}" with pattern length {}'.format(name, pat_len))
# get some random patterns (all of them would use way too much time)
for pattern in patterns[pat_len]:
start_time = time.perf_counter()
sm.search(pattern, text, alg, True)
end_time = time.perf_counter()
duration = end_time - start_time
t.append(duration)
measurements[name][pat_len]['min'] = min(t)
measurements[name][pat_len]['max'] = max(t)
avg = sum(t)/len(t)
measurements[name][pat_len]['avg'] = avg
measurements[name][pat_len]['std'] = sum([ (_ - avg)**2 for _ in t ])
In [8]:
m = measurements
N = len(m['naive'])
ind = np.arange(N)
width = 1/len(m)
bars = []
i = 0
bars = []
for alg in m:
avg = []
std = []
pat_lengths = [ _ for _ in m[alg] ]
pat_lengths.sort()
for pat_len in pat_lengths:
avg.append(m[alg][pat_len]['avg']*1000)
std.append(m[alg][pat_len]['std']*1000)
bars.append(plt.bar(ind+i*width, tuple(avg), width, yerr=tuple(std), color=cm.Set1(i/N, 1)))
plt.ylabel('milliseconds')
plt.xlabel('pattern length')
plt.xticks(ind+width/2., pat_lengths )
i+=1
plt.legend(bars, [ _ for _ in m ], fontsize='xx-small', fancybox=True, shadow=True)
plt.show()
In [9]:
import cProfile
text = ''
with open(os.path.join(root, 'resources/pi_10million.txt'), 'r') as f:
text = f.read()
pattern = '9529081197139256765089770918338138516867176390702131969107199123500774303145958784697786363208486985'
for name in sm.algs:
cmd = "sm.search(pattern, text, sm.algs['{}'], True)".format(name)
print(name)
cProfile.run(cmd)
In [10]:
patterns = {}
passes = 5
abc = 'abcdefhijklmnopqrstuvwxyz'
for pat_len in range(3, len(abc), 3):
patterns[pat_len] = abc[:pat_len]
measurements = {}
for name in algs:
alg = algs[name]
measurements[name] = {}
for pat_len in sorted( _ for _ in patterns ):
measurements[name][pat_len] = {
'min': sys.maxsize,
'max': 0,
'avg': sys.maxsize,
}
t = []
print('Measuring time for "{}" with pattern length {}'.format(name, pat_len))
for pattern in patterns[pat_len]:
text = pattern * 100000 + 'z'
for _ in range(passes):
start_time = time.perf_counter()
sm.search(pattern, text, alg, True)
end_time = time.perf_counter()
duration = end_time - start_time
t.append(duration)
measurements[name][pat_len]['min'] = min(t)
measurements[name][pat_len]['max'] = max(t)
avg = sum(t)/len(t)
measurements[name][pat_len]['avg'] = avg
measurements[name][pat_len]['std'] = sum([ (_ - avg)**2 for _ in t ])
In [11]:
m = measurements
N = len(m['naive'])
ind = np.arange(N)
width = 1/len(m)
bars = []
i = 0
bars = []
for alg in m:
avg = []
std = []
pat_lengths = [ _ for _ in m[alg] ]
pat_lengths.sort()
for pat_len in pat_lengths:
avg.append(m[alg][pat_len]['avg']*1000)
std.append(m[alg][pat_len]['std']*1000)
bars.append(plt.bar(ind+i*width, tuple(avg), width, yerr=tuple(std), color=cm.Set1(i/N, 1)))
plt.ylabel('milliseconds')
plt.xlabel('pattern length')
plt.xticks(ind+width/2., pat_lengths )
i+=1
plt.legend(bars, [ _ for _ in m ], fontsize='xx-small', fancybox=True, shadow=True)
plt.show()
In [11]: