The goal here is to test an exponential duration instead of a [0-1]^2


In [1]:
from IPython.display import HTML
HTML('''<h1>Table of Contents</h1>

<ol>
    <li><a href='#Comparing-real-and-dummy-delays'>Comparison of delays between real and dummies</a></li>
</ol>
<hr/>''')





In [2]:
%pylab inline

import sys, os
# Add parent path
sys.path.append(os.getcwd()[:-len('/notebooks')])


Populating the interactive namespace from numpy and matplotlib

In [3]:
# Large plots
import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = 16, 9

In [4]:
import numpy as np
import pandas as pd

In [5]:
from hubbub.generator.generator import Simulator
from hubbub.generator.heartbeat import HeartBeatSimulator
from hubbub.datasets.simulations import simple_log, gauss_log, SIMPLE_LOG as SIMPLE_LOG_SAMPLE

In [6]:
# Let's monkey patch the delay() function of the simulator with an exponential: 

def delay_expo(instance):
    return instance.period * 1 * -log(random.random())
    #return 0.5
    #return instance.period * 2 * (exp(random.random())-1)
    #return instance.period * 2 * random()**2

HeartBeatSimulator.delay = delay_expo

In [7]:
#SIMPLE_LOG = SIMPLE_LOG_SAMPLE
SIMPLE_LOG_SAMPLE


Out[7]:
[(datetime.datetime(2000, 1, 1, 0, 1, 1, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 2, 9, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 3, 2, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 4, 8, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 5, 3, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 6, 7, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 7, 4, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 8, 6, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 9, 5, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 10, 0, 123456), 10)]

In [8]:
# Generating "real" messages dataset:
SIMPLE_LOG = simple_log(n=2000)
#SIMPLE_LOG = gauss_log(n=200)
SIMPLE_LOG[:10]


Out[8]:
[(datetime.datetime(2000, 1, 1, 0, 0, 27, 259110), 10),
 (datetime.datetime(2000, 1, 1, 0, 1, 21, 300142), 10),
 (datetime.datetime(2000, 1, 1, 0, 2, 25, 673021), 10),
 (datetime.datetime(2000, 1, 1, 0, 2, 31, 421483), 10),
 (datetime.datetime(2000, 1, 1, 0, 2, 37, 179698), 10),
 (datetime.datetime(2000, 1, 1, 0, 4, 30, 901837), 10),
 (datetime.datetime(2000, 1, 1, 0, 4, 40, 574020), 10),
 (datetime.datetime(2000, 1, 1, 0, 5, 20, 727957), 10),
 (datetime.datetime(2000, 1, 1, 0, 6, 34, 384024), 10),
 (datetime.datetime(2000, 1, 1, 0, 6, 48, 710755), 10)]

Running simulator


In [9]:
HeartBeatSimulator.period = 0.5
result_sm = Simulator(SIMPLE_LOG).run()
results_HB = [
    HeartBeatSimulator(SIMPLE_LOG).run() for i in range(5)
#    HeartBeatSimulator(SIMPLE_LOG).run(delay=lambda: 5) for i in xrange(10)
    ]

results_HB[0][:10]


Out[9]:
[(datetime.datetime(2000, 1, 1, 0, 0), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 0, 791511), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 1, 434514), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 4, 288828), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 7, 335546), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 9, 175393), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 9, 966098), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 11, 302504), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 12, 857891), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 14, 165319), 10)]

Converting results to timestamps for plotting:


In [10]:
import time
def timestamp(n):
    unix_time = time.mktime(n.timetuple()) + n.microsecond/1000000.
    return unix_time

In [11]:
r_real = pd.DataFrame(
    [(0, timestamp(i[0]), 'SIMPLE_LOG') for i in SIMPLE_LOG],
    columns=('dummy', 'timestamp', 'source'),
    )
r_real


Out[11]:
dummy timestamp source
0 0 9.466812e+08 SIMPLE_LOG
1 0 9.466813e+08 SIMPLE_LOG
2 0 9.466813e+08 SIMPLE_LOG
3 0 9.466814e+08 SIMPLE_LOG
4 0 9.466814e+08 SIMPLE_LOG
5 0 9.466815e+08 SIMPLE_LOG
6 0 9.466815e+08 SIMPLE_LOG
7 0 9.466815e+08 SIMPLE_LOG
8 0 9.466816e+08 SIMPLE_LOG
9 0 9.466816e+08 SIMPLE_LOG
10 0 9.466816e+08 SIMPLE_LOG
11 0 9.466817e+08 SIMPLE_LOG
12 0 9.466817e+08 SIMPLE_LOG
13 0 9.466817e+08 SIMPLE_LOG
14 0 9.466818e+08 SIMPLE_LOG
15 0 9.466818e+08 SIMPLE_LOG
16 0 9.466819e+08 SIMPLE_LOG
17 0 9.466819e+08 SIMPLE_LOG
18 0 9.466821e+08 SIMPLE_LOG
19 0 9.466821e+08 SIMPLE_LOG
20 0 9.466821e+08 SIMPLE_LOG
21 0 9.466821e+08 SIMPLE_LOG
22 0 9.466821e+08 SIMPLE_LOG
23 0 9.466821e+08 SIMPLE_LOG
24 0 9.466821e+08 SIMPLE_LOG
25 0 9.466821e+08 SIMPLE_LOG
26 0 9.466822e+08 SIMPLE_LOG
27 0 9.466823e+08 SIMPLE_LOG
28 0 9.466823e+08 SIMPLE_LOG
29 0 9.466824e+08 SIMPLE_LOG
... ... ... ...
1970 0 9.467667e+08 SIMPLE_LOG
1971 0 9.467667e+08 SIMPLE_LOG
1972 0 9.467667e+08 SIMPLE_LOG
1973 0 9.467668e+08 SIMPLE_LOG
1974 0 9.467668e+08 SIMPLE_LOG
1975 0 9.467669e+08 SIMPLE_LOG
1976 0 9.467669e+08 SIMPLE_LOG
1977 0 9.467669e+08 SIMPLE_LOG
1978 0 9.467670e+08 SIMPLE_LOG
1979 0 9.467670e+08 SIMPLE_LOG
1980 0 9.467670e+08 SIMPLE_LOG
1981 0 9.467670e+08 SIMPLE_LOG
1982 0 9.467670e+08 SIMPLE_LOG
1983 0 9.467671e+08 SIMPLE_LOG
1984 0 9.467671e+08 SIMPLE_LOG
1985 0 9.467672e+08 SIMPLE_LOG
1986 0 9.467672e+08 SIMPLE_LOG
1987 0 9.467672e+08 SIMPLE_LOG
1988 0 9.467672e+08 SIMPLE_LOG
1989 0 9.467672e+08 SIMPLE_LOG
1990 0 9.467673e+08 SIMPLE_LOG
1991 0 9.467673e+08 SIMPLE_LOG
1992 0 9.467674e+08 SIMPLE_LOG
1993 0 9.467674e+08 SIMPLE_LOG
1994 0 9.467674e+08 SIMPLE_LOG
1995 0 9.467675e+08 SIMPLE_LOG
1996 0 9.467675e+08 SIMPLE_LOG
1997 0 9.467675e+08 SIMPLE_LOG
1998 0 9.467676e+08 SIMPLE_LOG
1999 0 9.467676e+08 SIMPLE_LOG

2000 rows × 3 columns


In [12]:
r_dummyHB = [
    pd.DataFrame(
        [(1, timestamp(i[0]), 'HB{}'.format(index)) for i in r],
        columns=('dummy', 'timestamp', 'source'),
        )
    for index, r in enumerate(results_HB)
    ]
r_dummyHB[0]


Out[12]:
dummy timestamp source
0 1 9.466812e+08 HB0
1 1 9.466812e+08 HB0
2 1 9.466812e+08 HB0
3 1 9.466812e+08 HB0
4 1 9.466812e+08 HB0
5 1 9.466812e+08 HB0
6 1 9.466812e+08 HB0
7 1 9.466812e+08 HB0
8 1 9.466812e+08 HB0
9 1 9.466812e+08 HB0
10 1 9.466812e+08 HB0
11 1 9.466812e+08 HB0
12 1 9.466812e+08 HB0
13 1 9.466812e+08 HB0
14 1 9.466812e+08 HB0
15 1 9.466812e+08 HB0
16 1 9.466812e+08 HB0
17 1 9.466812e+08 HB0
18 1 9.466812e+08 HB0
19 1 9.466812e+08 HB0
20 1 9.466812e+08 HB0
21 1 9.466812e+08 HB0
22 1 9.466812e+08 HB0
23 1 9.466812e+08 HB0
24 1 9.466812e+08 HB0
25 1 9.466812e+08 HB0
26 1 9.466812e+08 HB0
27 1 9.466812e+08 HB0
28 1 9.466812e+08 HB0
29 1 9.466812e+08 HB0
... ... ... ...
86035 1 9.467676e+08 HB0
86036 1 9.467676e+08 HB0
86037 1 9.467676e+08 HB0
86038 1 9.467676e+08 HB0
86039 1 9.467676e+08 HB0
86040 1 9.467676e+08 HB0
86041 1 9.467676e+08 HB0
86042 1 9.467676e+08 HB0
86043 1 9.467676e+08 HB0
86044 1 9.467676e+08 HB0
86045 1 9.467676e+08 HB0
86046 1 9.467676e+08 HB0
86047 1 9.467676e+08 HB0
86048 1 9.467676e+08 HB0
86049 1 9.467676e+08 HB0
86050 1 9.467676e+08 HB0
86051 1 9.467676e+08 HB0
86052 1 9.467676e+08 HB0
86053 1 9.467676e+08 HB0
86054 1 9.467676e+08 HB0
86055 1 9.467676e+08 HB0
86056 1 9.467676e+08 HB0
86057 1 9.467676e+08 HB0
86058 1 9.467676e+08 HB0
86059 1 9.467676e+08 HB0
86060 1 9.467676e+08 HB0
86061 1 9.467676e+08 HB0
86062 1 9.467676e+08 HB0
86063 1 9.467676e+08 HB0
86064 1 9.467676e+08 HB0

86065 rows × 3 columns

Analyzing delays

Plotting


In [13]:
r = r_dummyHB[0]
figure()
plot(r['timestamp'][:100].diff(), 'o-')
show()

figure()
for r in r_dummyHB:
    plot(r['timestamp'][:100].diff(), 'o')
show()



In [14]:
r_mixed = [
    pd.concat((r_real, r))
    for r in r_dummyHB
]
for r in r_mixed:
    r.sort('timestamp', inplace=True)
    r['before'] = r['timestamp'].diff()
    r['after'] = -r['timestamp'].diff(periods=-1)

r_mixed[0].head(10)


Out[14]:
dummy timestamp source before after
0 1 9.466812e+08 HB0 NaN 0.791511
1 1 9.466812e+08 HB0 0.791511 0.643003
2 1 9.466812e+08 HB0 0.643003 2.854314
3 1 9.466812e+08 HB0 2.854314 3.046718
4 1 9.466812e+08 HB0 3.046718 1.839847
5 1 9.466812e+08 HB0 1.839847 0.790705
6 1 9.466812e+08 HB0 0.790705 1.336406
7 1 9.466812e+08 HB0 1.336406 1.555387
8 1 9.466812e+08 HB0 1.555387 1.307428
9 1 9.466812e+08 HB0 1.307428 0.319855

Comparing real and dummy delays


In [15]:
print(r_real['timestamp'][0])
r = r_mixed[0]
plot(r['before'])
r[r['dummy'] == 0].head(10)


946681227.259
Out[15]:
dummy timestamp source before after
0 0 9.466812e+08 SIMPLE_LOG 0.267360 0.205539
1 0 9.466813e+08 SIMPLE_LOG 3.448065 0.240749
2 0 9.466813e+08 SIMPLE_LOG 0.474955 0.557656
3 0 9.466814e+08 SIMPLE_LOG 0.329274 0.204550
4 0 9.466814e+08 SIMPLE_LOG 0.335884 1.617253
5 0 9.466815e+08 SIMPLE_LOG 0.128929 0.611676
6 0 9.466815e+08 SIMPLE_LOG 0.150900 0.365202
7 0 9.466815e+08 SIMPLE_LOG 0.337730 0.079700
8 0 9.466816e+08 SIMPLE_LOG 0.106495 0.512115
9 0 9.466816e+08 SIMPLE_LOG 1.121043 0.190035

Delay before


In [16]:
# Average delay

if False:
    for r in r_mixed:
        print('all  mean', r['before'].mean())
        print('all  std ', r['before'].std())
        print('dumm mean', r[r['dummy'] == 1]['before'].mean())
        print('dumm std ', r[r['dummy'] == 1]['before'].std())
        print('real mean', r[r['dummy'] == 0]['before'].mean())
        print('real std ', r[r['dummy'] == 0]['before'].std())
        print

bar(
    range(0, len(r_mixed)*3, 3), 
    [r['before'].mean() for r in r_mixed], 
    yerr=[r['before'].std() for r in r_mixed], 
    color='g',
    )
    
bar(
    range(1, len(r_mixed)*3, 3), 
    [r[r['dummy'] == 1]['before'].mean() for r in r_mixed], 
    yerr=[r[r['dummy'] == 1]['before'].std() for r in r_mixed], 
    color='y',
    )

bar(
    range(2, len(r_mixed)*3, 3), 
    [r[r['dummy'] == 0]['before'].mean() for r in r_mixed], 
    yerr=[r[r['dummy'] == 0]['before'].std() for r in r_mixed], 
    color='r',
    )


Out[16]:
<Container object of 5 artists>

Delay After


In [17]:
# Average delay

if False:
    for r in r_mixed:
        print('all  mean', r['after'].mean())
        print('all  std ', r['after'].std())
        print('dumm mean', r[r['dummy'] == 1]['after'].mean())
        print('dumm std ', r[r['dummy'] == 1]['after'].std())
        print('real mean', r[r['dummy'] == 0]['after'].mean())
        print('real std ', r[r['dummy'] == 0]['after'].std())
        print

bar(
    range(0, len(r_mixed)*3, 3), 
    [r['after'].mean() for r in r_mixed], 
    yerr=[r['after'].std() for r in r_mixed], 
    color='g',
    )
    
bar(
    range(1, len(r_mixed)*3, 3), 
    [r[r['dummy'] == 1]['after'].mean() for r in r_mixed], 
    yerr=[r[r['dummy'] == 1]['after'].std() for r in r_mixed], 
    color='y',
    )

bar(
    range(2, len(r_mixed)*3, 3), 
    [r[r['dummy'] == 0]['after'].mean() for r in r_mixed], 
    yerr=[r[r['dummy'] == 0]['after'].std() for r in r_mixed], 
    color='r',
    )


Out[17]:
<Container object of 5 artists>

Delay over time


In [18]:
r = r_mixed[0]
figure()
plot(r['before'][:100], 'o-')
show()

figure()
for r in r_mixed:
    plot(r['before'][:100], 'o')
show()


Delay distribution (before)


In [22]:
distrib_real = r[r['dummy'] == 0]['before'].copy()
distrib_real.sort()

distrib_dummy = r[r['dummy'] == 1]['before'].copy()
distrib_dummy.sort()

figure()
title('dummy messages')
plot(distrib_dummy, '-')
show()

figure()
title('real messages')
plot(distrib_real, '-')

figure()
title('dummy + adapted real messages')
plot(distrib_dummy, '-')
mapped_x_axis = np.array(range(len(distrib_real))) * len(distrib_dummy) / float(len(distrib_real))
plot(mapped_x_axis, distrib_real, '-')

show()


Delay distribution (after)


In [20]:
distrib_real = r[r['dummy'] == 0]['after'].copy()
distrib_real.sort()

distrib_dummy = r[r['dummy'] == 1]['after'].copy()
distrib_dummy.sort()

figure()
title('dummy messages')
plot(distrib_dummy, 'x')
show()

figure()
title('real messages')
plot(distrib_real, 'x')

figure()
title('dummy + adapted real messages')
plot(distrib_dummy, '-')
mapped_x_axis = np.array(range(len(distrib_real))) * len(distrib_dummy) / float(len(distrib_real))
plot(mapped_x_axis, distrib_real, '-')

show()



In [21]:
distrib_real = pd.DataFrame(r[r['dummy'] == 0]['before'].copy())
distrib_real.sort()
distrib_real['group'] = pd.cut(distrib_real, bins=[2, 4, 6, 8, 10])
#distrib_real.set_index(['group'], inplace=True)
#distrib_real.unstack('group')
distrib_real.groupby('group')
distrib_real


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-b96ee4aefcbd> in <module>()
      1 distrib_real = pd.DataFrame(r[r['dummy'] == 0]['before'].copy())
      2 distrib_real.sort()
----> 3 distrib_real['group'] = pd.cut(distrib_real, bins=[2, 4, 6, 8, 10])
      4 #distrib_real.set_index(['group'], inplace=True)
      5 #distrib_real.unstack('group')

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   2005         else:
   2006             # set column
-> 2007             self._set_item(key, value)
   2008 
   2009     def _setitem_slice(self, key, value):

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   2083         is_existing = key in self.columns
   2084         self._ensure_valid_index(value)
-> 2085         value = self._sanitize_column(key, value)
   2086         NDFrame._set_item(self, key, value)
   2087 

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value)
   2146                     value = com._possibly_convert_platform(value)
   2147                 else:
-> 2148                     value = com._asarray_tuplesafe(value)
   2149             elif isinstance(value, PeriodIndex):
   2150                 value = value.asobject

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/common.py in _asarray_tuplesafe(values, dtype)
   2175         return lib.list_to_object_array(values)
   2176 
-> 2177     result = np.asarray(values, dtype=dtype)
   2178 
   2179     if issubclass(result.dtype.type, compat.string_types):

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    458 
    459     """
--> 460     return array(a, dtype, copy=False, order=order)
    461 
    462 def asanyarray(a, dtype=None, order=None):

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/categorical.py in __array__(self, dtype)
    126 
    127     def __array__(self, dtype=None):
--> 128         return com.take_1d(self.levels.values, self.labels)
    129 
    130     def __len__(self):

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/common.py in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
    705                                  axis=axis, mask_info=mask_info)
    706 
--> 707     func(arr, indexer, out, fill_value)
    708 
    709     if flip_order:

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/algos.so in pandas.algos.take_1d_object_object (pandas/algos.c:77004)()

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)

In [ ]: