In [1]:
import sys  # machine-specific workaround: extend the path so that aiohttp can be imported
sys.path.append('/Users/maggiori/anaconda/envs/py35/lib/python3.5/site-packages')

In [2]:
# to import modules locally without having installed the entire package
# http://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import os, sys, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

In [3]:
import signal
import time
import subprocess
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook')

Time Series Database

This notebook demonstrates the persistent behavior of the database.

Initialization

  • Clear the file system for demonstration purposes.

In [4]:
# database parameters
ts_length = 100
data_dir = '../db_files'
db_name = 'default'
dir_path = data_dir + '/' + db_name + '/'

In [5]:
# clear file system for testing
if not os.path.exists(dir_path):
    os.makedirs(dir_path)
filelist = [dir_path + f for f in os.listdir(dir_path)]
for f in filelist:
    os.remove(f)
  • Load the database server.

In [6]:
# when running from the terminal:
# python go_server_persistent.py --ts_length 100 --data_dir '../db_files' --db_name 'default'

# here we load the server as a subprocess for demonstration purposes
server = subprocess.Popen(['python', '../go_server_persistent.py',
                           '--ts_length', str(ts_length), '--data_dir', data_dir, '--db_name', db_name])
time.sleep(5)  # make sure it loads completely
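
A fixed sleep is the simplest way to wait for startup, but it hides failures. Below is a minimal fail-fast sketch; it is a hypothetical helper, not part of the package, and it assumes that a failed launch makes the child process exit early:

# hypothetical helper: wait out the startup period, but raise immediately if
# the server process dies (on success it still waits the full timeout; polling
# a known port would return sooner)
def wait_for_startup(proc, timeout=10.0, poll_interval=0.5):
    deadline = time.time() + timeout
    while time.time() < deadline:
        if proc.poll() is not None:  # process exited -> startup failed
            raise RuntimeError(
                'server exited early with code {}'.format(proc.returncode))
        time.sleep(poll_interval)

# usage: wait_for_startup(server)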
  • Load the database webserver.

In [7]:
# when running from the terminal
# python go_webserver.py

# here we load the server as a subprocess for demonstration purposes
webserver = subprocess.Popen(['python', '../go_webserver.py'])
time.sleep(5)  # make sure it loads completely
  • Import the web interface and initialize it.

In [8]:
from webserver import *


Generating LALR tables

(This message is emitted by the parser generator when the webserver module is imported; it is expected output, not an error.)

In [9]:
web_interface = WebInterface()

Generate Data

Let's create some dummy data to aid in our demonstration. You will need to import the timeseries package to work with the TimeSeries format.

Note: the database is persistent, so it can store data between sessions, but we will start with an empty database here for demonstration purposes.


In [10]:
from timeseries import *
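
Before generating data in bulk, here is a minimal sketch of the TimeSeries interface as this notebook uses it: the constructor takes parallel sequences of times and values (as in tsmaker below), and instances expose mean() and std(), the statistics that the 'stats' trigger computes later.

# minimal TimeSeries usage; the (times, values) constructor and the summary
# statistics are the only parts of the interface this notebook relies on
t = np.arange(0.0, 1.0, 0.01)    # 100 evenly spaced times (matches ts_length)
v = np.sin(2 * np.pi * t)        # arbitrary values of the same length
example = TimeSeries(t, v)
example.mean(), example.std()    # statistics computed by the trigger below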

In [11]:
def tsmaker(m, s, j):
    '''
    Helper function: randomly generates a time series for testing.

    Parameters
    ----------
    m : float
        Mean value for generating time series data
    s : float
        Standard deviation value for generating time series data
    j : float
        Quantifies the "jitter" to add to the time series data

    Returns
    -------
    A tuple containing the metadata dictionary and the associated TimeSeries object.
    '''

    # generate metadata
    meta = {}
    meta['order'] = int(np.random.choice(
        [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]))
    meta['blarg'] = int(np.random.choice([1, 2]))

    # generate time series data
    t = np.arange(0.0, 1.0, 0.01)  # 100 evenly spaced times; must match ts_length
    v = norm.pdf(t, m, s) + j * np.random.randn(ts_length)

    # return time series and metadata
    return meta, TimeSeries(t, v)

In [12]:
# generate sample time series
num_ts = 50
mus = np.random.uniform(low=0.0, high=1.0, size=num_ts)
sigs = np.random.uniform(low=0.05, high=0.4, size=num_ts)
jits = np.random.uniform(low=0.05, high=0.2, size=num_ts)

# initialize dictionaries for time series and their metadata
primary_keys = []
tsdict = {}
metadict = {}

# fill dictionaries with randomly generated entries for database
for i, m, s, j in zip(range(num_ts), mus, sigs, jits):
    meta, tsrs = tsmaker(m, s, j)  # generate data
    pk = "ts-{}".format(i)  # generate primary key
    primary_keys.append(pk) # keep track of all primary keys
    tsdict[pk] = tsrs  # store time series data
    metadict[pk] = meta  # store metadata
    
# to assist with later testing
ts_keys = sorted(tsdict.keys())
    
# randomly choose time series as vantage points
num_vps = 5
vpkeys = list(np.random.choice(ts_keys, size=num_vps, replace=False))
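# the database exposes one distance field per vantage point, named 'd_vp_<pk>'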
vpdist = ['d_vp_{}'.format(key) for key in vpkeys]

Insert Data

Let's start by loading the data into the database, using the REST API web interface.


In [13]:
# check that the database is empty
web_interface.select()


Out[13]:
OrderedDict()

In [14]:
# add stats trigger
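# arguments: the trigger module to run ('stats'), the event that fires it
# ('insert_ts'), the metadata fields the trigger populates (['mean', 'std']),
# and an optional argument passed through to the trigger (None here)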
web_interface.add_trigger('stats', 'insert_ts', ['mean', 'std'], None)


Out[14]:
'OK'

In [15]:
# insert the time series
for k in tsdict:
    web_interface.insert_ts(k, tsdict[k])

In [16]:
# upsert the metadata
for k in tsdict:
    web_interface.upsert_meta(k, metadict[k])

In [17]:
# add the vantage points
for i in range(num_vps):
    web_interface.insert_vp(vpkeys[i])

Inspect Data

Let's inspect the data to make sure that all the previous operations were successful.
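
For reference, the select() calls below follow a few conventions, inferred from their usage in this notebook rather than from a formal API description:

# fields=['ts']    -> return the named fields (here, the time series itself)
# fields=[]        -> return all metadata fields
# fields=None      -> return no fields (primary keys only)
# md={...}         -> filter on metadata, e.g. {'pk': 'ts-0'} or {'vp': True}
# additional={...} -> query modifiers, e.g. {'sort_by': '+pk', 'limit': 1}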


In [18]:
# select all database entries; all metadata fields
results = web_interface.select(fields=[])

# we have the right number of database entries
assert len(results) == num_ts

# we have all the right primary keys
assert sorted(results.keys()) == ts_keys

In [19]:
# check that all the time series and metadata match
for k in tsdict:
    results = web_interface.select(fields=['ts'], md={'pk': k})
    assert results[k]['ts'] == tsdict[k]
    results = web_interface.select(fields=[], md={'pk': k})
    for field in metadict[k]:
        assert metadict[k][field] == results[k][field]

In [20]:
# check that the vantage points match
print('Vantage points selected:', vpkeys)
print('Vantage points in database:',
      web_interface.select(fields=None, md={'vp': True}, additional={'sort_by': '+pk'}).keys())


Vantage points selected: ['ts-25', 'ts-18', 'ts-3', 'ts-16', 'ts-49']
Vantage points in database: odict_keys(['ts-16', 'ts-18', 'ts-25', 'ts-3', 'ts-49'])

In [21]:
# check that the vantage point distance fields have been created
print('Vantage point distance fields:', vpdist)
web_interface.select(fields=vpdist, additional={'sort_by': '+pk', 'limit': 1})


Vantage point distance fields: ['d_vp_ts-25', 'd_vp_ts-18', 'd_vp_ts-3', 'd_vp_ts-16', 'd_vp_ts-49']
Out[21]:
OrderedDict([('ts-0',
              OrderedDict([('d_vp_ts-3', 0.7125162482261658),
                           ('d_vp_ts-25', 0.5719413757324219),
                           ('d_vp_ts-49', 0.542866051197052),
                           ('d_vp_ts-18', 0.3780462443828583),
                           ('d_vp_ts-16', 0.8789393901824951)]))])

In [22]:
# check that the trigger has executed as expected (allowing for rounding errors)
for k in tsdict:
    results = web_interface.select(fields=['mean', 'std'], md={'pk': k})
    assert np.round(results[k]['mean'], 4) == np.round(tsdict[k].mean(), 4)
    assert np.round(results[k]['std'], 4) == np.round(tsdict[k].std(), 4)

Let's generate an additional time series for similarity searches. We'll store the time series and the results of the similarity searches, so that we can compare against them after reloading the database.


In [23]:
_, query = tsmaker(np.random.uniform(low=0.0, high=1.0),
                   np.random.uniform(low=0.05, high=0.4),
                   np.random.uniform(low=0.05, high=0.2))

In [24]:
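# exact nearest-neighbor search using the vantage points; the second argument
# appears to be the number of matches to return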
results_vp = web_interface.vp_similarity_search(query, 1)
results_vp


Out[24]:
OrderedDict([('ts-15', 0.2561592566535857)])

In [25]:
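# approximate nearest-neighbor search using the iSAX index; note that the
# reported distance may differ from the exact vantage point search above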
results_isax = web_interface.isax_similarity_search(query)
results_isax


Out[25]:
OrderedDict([('ts-15', 1.3528808283518743)])

Finally, let's store our iSAX tree representation. Each line of the tree lists a node's iSAX word, the number of time series assigned to that node, and their primary keys.


In [26]:
results_tree = web_interface.isax_tree()
print(results_tree)


root
--->['10', '00', '01', '11']: 4 ['ts-12', 'ts-36', 'ts-37', 'ts-41']
--->['01', '00', '10', '11']: 3 ['ts-14', 'ts-17', 'ts-28']
--->['11', '10', '00', '01']: 3 ['ts-26', 'ts-31', 'ts-5']
--->['11', '10', '01', '00']: 2 ['ts-20', 'ts-42']
--->['11', '00', '00', '11']: 1 ['ts-0']
--->['00', '01', '11', '11']: 5 ['ts-13', 'ts-19', 'ts-22', 'ts-3', 'ts-44']
--->['11', '10', '00', '00']: 4 ['ts-10', 'ts-4', 'ts-45', 'ts-49']
--->['11', '01', '00', '01']: 3 ['ts-18', 'ts-2', 'ts-47']
--->['00', '00', '10', '11']: 3 ['ts-25', 'ts-8', 'ts-9']
--->['10', '00', '10', '11']: 3 ['ts-11', 'ts-16', 'ts-23']
--->['10', '00', '00', '11']: 1 ['ts-24']
--->['00', '01', '10', '11']: 2 ['ts-15', 'ts-27']
--->['11', '11', '01', '00']: 3 ['ts-1', 'ts-29', 'ts-33']
--->['11', '11', '00', '00']: 1 ['ts-40']
--->['01', '00', '01', '11']: 2 ['ts-38', 'ts-43']
--->['11', '01', '00', '10']: 2 ['ts-32', 'ts-46']
--->['00', '01', '10', '10']: 1 ['ts-21']
--->['00', '10', '10', '10']: 2 ['ts-39', 'ts-48']
--->['10', '10', '00', '10']: 1 ['ts-34']
--->['11', '10', '00', '10']: 2 ['ts-30', 'ts-6']
--->['00', '10', '11', '11']: 1 ['ts-7']
--->['10', '10', '10', '00']: 1 ['ts-35']

Terminate and Reload Database

Now that we have verified that everything is loaded, let's close the database and re-open it.


In [27]:
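# send SIGINT (the Ctrl-C signal) rather than SIGKILL, giving the servers a
# chance to shut down cleanly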
os.kill(server.pid, signal.SIGINT)
time.sleep(5)  # give it time to terminate
os.kill(webserver.pid, signal.SIGINT)
time.sleep(5)  # give it time to terminate
web_interface = None

In [28]:
server = subprocess.Popen(['python', '../go_server_persistent.py',
                           '--ts_length', str(ts_length), '--data_dir', data_dir, '--db_name', db_name])
time.sleep(5)  # give it time to load fully
webserver = subprocess.Popen(['python', '../go_webserver.py'])
time.sleep(5)  # give it time to load fully
web_interface = WebInterface()

Inspect Data

Let's repeat the previous tests to check whether our persistence architecture worked.


In [29]:
# select all database entries; all metadata fields
results = web_interface.select(fields=[])

# we have the right number of database entries
assert len(results) == num_ts

# we have all the right primary keys
assert sorted(results.keys()) == ts_keys

In [30]:
# check that all the time series and metadata match
for k in tsdict:
    results = web_interface.select(fields=['ts'], md={'pk': k})
    assert results[k]['ts'] == tsdict[k]
    results = web_interface.select(fields=[], md={'pk': k})
    for field in metadict[k]:
        assert metadict[k][field] == results[k][field]

In [31]:
# check that the vantage points match
print('Vantage points selected:', vpkeys)
print('Vantage points in database:',
      web_interface.select(fields=None, md={'vp': True}, additional={'sort_by': '+pk'}).keys())


Vantage points selected: ['ts-25', 'ts-18', 'ts-3', 'ts-16', 'ts-49']
Vantage points in database: odict_keys(['ts-16', 'ts-18', 'ts-25', 'ts-3', 'ts-49'])

In [32]:
# check that the iSAX tree has fully reloaded
print(web_interface.isax_tree())


root
--->['11', '10', '00', '01']: 3 ['ts-26', 'ts-31', 'ts-5']
--->['01', '00', '10', '11']: 3 ['ts-14', 'ts-17', 'ts-28']
--->['11', '11', '01', '00']: 3 ['ts-1', 'ts-29', 'ts-33']
--->['10', '00', '01', '11']: 4 ['ts-12', 'ts-36', 'ts-37', 'ts-41']
--->['00', '01', '10', '11']: 2 ['ts-15', 'ts-27']
--->['01', '00', '01', '11']: 2 ['ts-38', 'ts-43']
--->['11', '01', '00', '10']: 2 ['ts-32', 'ts-46']
--->['11', '01', '00', '01']: 3 ['ts-18', 'ts-2', 'ts-47']
--->['11', '11', '00', '00']: 1 ['ts-40']
--->['10', '00', '10', '11']: 3 ['ts-11', 'ts-16', 'ts-23']
--->['11', '10', '00', '10']: 2 ['ts-30', 'ts-6']
--->['00', '01', '11', '11']: 5 ['ts-13', 'ts-19', 'ts-22', 'ts-3', 'ts-44']
--->['10', '00', '00', '11']: 1 ['ts-24']
--->['00', '00', '10', '11']: 3 ['ts-25', 'ts-8', 'ts-9']
--->['11', '10', '00', '00']: 4 ['ts-10', 'ts-4', 'ts-45', 'ts-49']
--->['10', '10', '00', '10']: 1 ['ts-34']
--->['11', '00', '00', '11']: 1 ['ts-0']
--->['00', '10', '10', '10']: 2 ['ts-39', 'ts-48']
--->['00', '01', '10', '10']: 1 ['ts-21']
--->['11', '10', '01', '00']: 2 ['ts-20', 'ts-42']
--->['10', '10', '10', '00']: 1 ['ts-35']
--->['00', '10', '11', '11']: 1 ['ts-7']


In [33]:
# compare vantage point search results
results_vp == web_interface.vp_similarity_search(query, 1)


Out[33]:
True

In [34]:
# compare iSAX search results
results_isax == web_interface.isax_similarity_search(query)


Out[34]:
True

In [35]:
# check that the trigger is still there by loading new data

# create test time series
_, test = tsmaker(np.random.uniform(low=0.0, high=1.0),
                  np.random.uniform(low=0.05, high=0.4),
                  np.random.uniform(low=0.05, high=0.2))

# insert test time series
web_interface.insert_ts('test', test)

# check that mean and standard deviation have been calculated
print(web_interface.select(fields=['mean', 'std'], md={'pk': 'test'}))

# remove test time series
web_interface.delete_ts('test');


OrderedDict([('test', OrderedDict([('mean', 1.003503441810608), ('std', 0.725745439529419)]))])

We have successfully reloaded all of the database components from disk!


In [36]:
# terminate processes before exiting
os.kill(server.pid, signal.SIGINT)
time.sleep(5)  # give it time to terminate
web_interface = None
webserver.terminate()