In [5]:
import sys
import math
import random
import time
from vp_tree.simple_vp import *
from timeseries.TimeSeries import TimeSeries
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from urllib.request import urlopen
%matplotlib inline
In [6]:
# Make the local TimeSeries package importable.
sys.path.append('/Users/Elena/Desktop/TimeSeries/')
In [52]:
def find_similar_pt():
    # Build a random 2-D data set and a random query point.
    rn = lambda: random.randint(0, 10000)
    aset = [(rn(), rn()) for _ in range(40000)]
    q = (rn(), rn())
    rad = 9990
    # Euclidean distance between two points.
    distance = lambda a, b: math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
    s = time.time()
    print("creating vptree...")
    root = VpNode(aset, distance=distance)
    print("vptree created", time.time() - s)
    s = time.time()
    print("searching...")
    se = VpSearch(root, q, rad, 30)
    #out = se.search()
    out = se.knn()
    for k, v in sorted(se.stat.items()):
        print(k, v)
    print("number of results: %s" % len(out))
    print("vptree search done, searching time", time.time() - s)
    # Project a list of points onto its x and y coordinates.
    projx = lambda x: map(lambda y: y[0], x)
    projy = lambda x: map(lambda y: y[1], x)
    fig, ax = plt.subplots(nrows=1, ncols=2)
    ax[0].scatter(list(projx(aset)), list(projy(aset)), s=20, alpha=0.1)
    ax[0].scatter([q[0]], [q[1]], s=40, color='g')
    ax[0].scatter(list(projx(out)), list(projy(out)), s=10, color='r')
    ax[0].annotate("query", xy=q)
    ax[1].scatter([q[0]], [q[1]], s=40, color='g')
    ax[1].scatter(list(projx(out)), list(projy(out)), s=10, color='r')
    plt.show()
Here we find the 30 points closest to a query point in a set of 40000 random 2-D tuples. The plots below show, on the left, the full data set with the query (green) and the retrieved neighbors (red), and on the right, only the query and its neighbors.
In [27]:
find_similar_pt()
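A quick way to build confidence in the result is to compare the tree search against a brute-force scan of the same data. The sketch below (with a hypothetical helper `check_knn`, not part of the notebook's library) builds its own small random set and checks that the VP-tree k-NN agrees with sorting by distance, assuming `VpSearch.knn()` returns the k nearest points as it is used above:
In [ ]:
def check_knn(n=5000, k=30):
    # Build a small random 2-D set and a query, mirroring find_similar_pt.
    rn = lambda: random.randint(0, 10000)
    aset = [(rn(), rn()) for _ in range(n)]
    q = (rn(), rn())
    distance = lambda a, b: math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
    root = VpNode(aset, distance=distance)
    tree_out = set(VpSearch(root, q, 9990, k).knn())
    # Brute force: sort the whole set by distance to the query and take the top k.
    brute_out = set(sorted(aset, key=lambda p: distance(q, p))[:k])
    # Note: ties at the k-th distance can legitimately make the two sets differ.
    print("match:", tree_out == brute_out)

check_knn()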
In [40]:
def tsmaker(m, s, j):
    "returns times and values of a normal pdf with jitter added"
    t = np.arange(0.0, 1.0, 0.01)
    v = norm.pdf(t, m, s) + j * np.random.randn(100)
    return (t, v)
In [53]:
# Random means, standard deviations, and jitter amplitudes for 50 synthetic series.
mus = np.random.uniform(low=0.0, high=1.0, size=50)
sigs = np.random.uniform(low=0.05, high=0.4, size=50)
jits = np.random.uniform(low=0.05, high=0.2, size=50)
In [54]:
ts_set = [tsmaker(m, s, j) for m, s, j in zip(mus, sigs, jits)]
In [55]:
ts_set[0][1]
Out[55]:
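To get a feel for what these synthetic series look like, it helps to plot a few of them; a minimal sketch using the `ts_set` built above:
In [ ]:
# Plot the first five generated series: x is time, y is the jittered normal pdf.
for t, v in ts_set[:5]:
    plt.plot(t, v, alpha=0.7)
plt.xlabel('t')
plt.show()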
In [50]:
def find_similar_ts():
    # Build a set of random series and a query drawn from the same parameters
    # as the second series, so at least one close match should exist.
    aset = [tsmaker(m, s, j) for m, s, j in zip(mus, sigs, jits)]
    q = tsmaker(mus[1], sigs[1], jits[1])
    rad = 9990
    # Euclidean distance between the value arrays of two series.
    distance = lambda a, b: math.sqrt(sum((x - y) ** 2 for x, y in zip(a[1], b[1])))
    s = time.time()
    print("creating vptree...")
    root = VpNode(aset, distance=distance)
    print("vptree created", time.time() - s)
    s = time.time()
    print("searching...")
    se = VpSearch(root, q, rad, 5)
    #out = se.search()
    out = se.knn()
    for k, v in sorted(se.stat.items()):
        print(k, v)
    print("number of results: %s" % len(out))
    print("vptree search done", time.time() - s)
    # Plot the query against its three nearest neighbors.
    plt.plot(q[1], label='original timeseries', linewidth=2)
    plt.plot(out[0][1], label='similar_1')
    plt.plot(out[1][1], label='similar_2')
    plt.plot(out[2][1], label='similar_3')
    plt.legend()
    plt.show()
In [51]:
find_similar_ts()
In information theory and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (i.e. insertions, deletions or substitutions) required to change one word into the other. It is named after Vladimir Levenshtein, who considered this distance in 1965.[1] Because it is a true metric (in particular it satisfies the triangle inequality), it can serve as the distance function of a VP-tree, just like the Euclidean distances used above.
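For example, turning "kitten" into "sitting" takes three edits: substitute k→s, substitute e→i, and append g. The definition translates directly into a recursion; the sketch below is exponential-time and for illustration only (the space-efficient dynamic-programming version actually used follows):
In [ ]:
def lev_recursive(a, b):
    # Distance to the empty string is the length of the other string.
    if not a:
        return len(b)
    if not b:
        return len(a)
    cost = 0 if a[-1] == b[-1] else 1  # matching last characters cost nothing
    return min(lev_recursive(a[:-1], b) + 1,           # deletion from a
               lev_recursive(a, b[:-1]) + 1,           # insertion into a
               lev_recursive(a[:-1], b[:-1]) + cost)   # substitution or match

lev_recursive('kitten', 'sitting')  # -> 3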
In [57]:
def levenshtein(a, b):
    "Calculates the Levenshtein distance between a and b."
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n
    current = list(range(n + 1))
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change = change + 1
            current[j] = min(add, delete, change)
    return current[n]

def find_similar_words(file_name):
    # Load the dictionary, skipping the first line and stripping newlines.
    with open(file_name) as f:
        next(f)
        aset = [w[:-1] for w in f]
    rad = 1
    distance = levenshtein
    s = time.time()
    print("\ninput set %s points" % len(aset))
    print("creating tree...")
    root = VpNode(aset, distance=distance)
    print("created: %s nodes" % VpNode.ids)
    print("done in s: %s" % (time.time() - s))
    print("searching...")
    while True:
        q = input(">>")
        s = time.time()
        se = VpSearch(root, q, rad, 10)
        out = se.knn()
        print(se.stat)
        print("found %s results:" % len(out))
        print("\n".join(out))
        print("done in s: %s" % (time.time() - s))
Note: since the word dictionary is quite large, the function below may take over 10 minutes to run:
In [ ]:
find_similar_words('wordsEn.txt')