In [21]:
from IPython.display import HTML
HTML('''<h1>Table of Contents</h1>

<ol>
    <li><a href='#Comparing-real-and-dummy-delays'>Comparison of delays between real and dummies</a></li>
</ol>
<hr/>''')





In [22]:
%pylab inline
from __future__ import print_function
if not 'xrange' in globals():
    xrange = range

import sys, os
# Add parent path
sys.path.append(os.getcwd()[:-len('/notebooks')])


Populating the interactive namespace from numpy and matplotlib

In [23]:
# Large plots
import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = 16, 9

In [24]:
import numpy as np
import pandas as pd

In [25]:
from hubbub.generator.generator import Simulator
from hubbub.generator.heartbeat import HeartBeatSimulator
from hubbub.datasets.simulations import simple_log, gauss_log, SIMPLE_LOG as SIMPLE_LOG_SAMPLE

In [26]:
#SIMPLE_LOG = SIMPLE_LOG_SAMPLE
SIMPLE_LOG_SAMPLE


Out[26]:
[(datetime.datetime(2000, 1, 1, 0, 1, 1, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 2, 9, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 3, 2, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 4, 8, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 5, 3, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 6, 7, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 7, 4, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 8, 6, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 9, 5, 123456), 10),
 (datetime.datetime(2000, 1, 1, 0, 10, 0, 123456), 10)]

In [27]:
# Generating "real" messages dataset:
SIMPLE_LOG = simple_log(n=2000)
#SIMPLE_LOG = gauss_log(n=200)
SIMPLE_LOG[:10]


Out[27]:
[(datetime.datetime(2000, 1, 1, 0, 1, 13, 663242), 10),
 (datetime.datetime(2000, 1, 1, 0, 2, 24, 407889), 10),
 (datetime.datetime(2000, 1, 1, 0, 2, 56, 331186), 10),
 (datetime.datetime(2000, 1, 1, 0, 3, 7, 556561), 10),
 (datetime.datetime(2000, 1, 1, 0, 3, 10, 532114), 10),
 (datetime.datetime(2000, 1, 1, 0, 3, 34, 527495), 10),
 (datetime.datetime(2000, 1, 1, 0, 3, 51, 413882), 10),
 (datetime.datetime(2000, 1, 1, 0, 4, 47, 92028), 10),
 (datetime.datetime(2000, 1, 1, 0, 6, 16, 166927), 10),
 (datetime.datetime(2000, 1, 1, 0, 8, 39, 529666), 10)]

Running simulator


In [28]:
HeartBeatSimulator.period = 0.5
result_sm = Simulator(SIMPLE_LOG).run()
results_HB = [
    HeartBeatSimulator(SIMPLE_LOG).run() for i in xrange(5)
#    HeartBeatSimulator(SIMPLE_LOG).run(delay=lambda: 5) for i in xrange(10)
    ]

results_HB[0][:10]


Out[28]:
[(datetime.datetime(2000, 1, 1, 0, 0), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 0, 373978), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 0, 978932), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 1, 183042), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 1, 213841), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 1, 479057), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 1, 977408), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 2, 209689), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 2, 884429), 10),
 (datetime.datetime(2000, 1, 1, 0, 0, 3, 459555), 10)]

Converting results to timestamps for plotting:


In [29]:
import time
def timestamp(n):
    unix_time = time.mktime(n.timetuple()) + n.microsecond/1000000.
    return unix_time

In [30]:
r_real = pd.DataFrame(
    [(0, timestamp(i[0]), 'SIMPLE_LOG') for i in SIMPLE_LOG],
    columns=('dummy', 'timestamp', 'source'),
    )
r_real


Out[30]:
dummy timestamp source
0 0 9.466813e+08 SIMPLE_LOG
1 0 9.466813e+08 SIMPLE_LOG
2 0 9.466814e+08 SIMPLE_LOG
3 0 9.466814e+08 SIMPLE_LOG
4 0 9.466814e+08 SIMPLE_LOG
5 0 9.466814e+08 SIMPLE_LOG
6 0 9.466814e+08 SIMPLE_LOG
7 0 9.466815e+08 SIMPLE_LOG
8 0 9.466816e+08 SIMPLE_LOG
9 0 9.466817e+08 SIMPLE_LOG
10 0 9.466818e+08 SIMPLE_LOG
11 0 9.466819e+08 SIMPLE_LOG
12 0 9.466819e+08 SIMPLE_LOG
13 0 9.466819e+08 SIMPLE_LOG
14 0 9.466819e+08 SIMPLE_LOG
15 0 9.466820e+08 SIMPLE_LOG
16 0 9.466820e+08 SIMPLE_LOG
17 0 9.466820e+08 SIMPLE_LOG
18 0 9.466821e+08 SIMPLE_LOG
19 0 9.466821e+08 SIMPLE_LOG
20 0 9.466822e+08 SIMPLE_LOG
21 0 9.466822e+08 SIMPLE_LOG
22 0 9.466822e+08 SIMPLE_LOG
23 0 9.466823e+08 SIMPLE_LOG
24 0 9.466823e+08 SIMPLE_LOG
25 0 9.466824e+08 SIMPLE_LOG
26 0 9.466824e+08 SIMPLE_LOG
27 0 9.466824e+08 SIMPLE_LOG
28 0 9.466825e+08 SIMPLE_LOG
29 0 9.466825e+08 SIMPLE_LOG
... ... ... ...
1970 0 9.467662e+08 SIMPLE_LOG
1971 0 9.467662e+08 SIMPLE_LOG
1972 0 9.467662e+08 SIMPLE_LOG
1973 0 9.467663e+08 SIMPLE_LOG
1974 0 9.467664e+08 SIMPLE_LOG
1975 0 9.467665e+08 SIMPLE_LOG
1976 0 9.467666e+08 SIMPLE_LOG
1977 0 9.467666e+08 SIMPLE_LOG
1978 0 9.467666e+08 SIMPLE_LOG
1979 0 9.467666e+08 SIMPLE_LOG
1980 0 9.467667e+08 SIMPLE_LOG
1981 0 9.467667e+08 SIMPLE_LOG
1982 0 9.467668e+08 SIMPLE_LOG
1983 0 9.467668e+08 SIMPLE_LOG
1984 0 9.467669e+08 SIMPLE_LOG
1985 0 9.467669e+08 SIMPLE_LOG
1986 0 9.467670e+08 SIMPLE_LOG
1987 0 9.467670e+08 SIMPLE_LOG
1988 0 9.467672e+08 SIMPLE_LOG
1989 0 9.467672e+08 SIMPLE_LOG
1990 0 9.467672e+08 SIMPLE_LOG
1991 0 9.467672e+08 SIMPLE_LOG
1992 0 9.467673e+08 SIMPLE_LOG
1993 0 9.467673e+08 SIMPLE_LOG
1994 0 9.467673e+08 SIMPLE_LOG
1995 0 9.467674e+08 SIMPLE_LOG
1996 0 9.467676e+08 SIMPLE_LOG
1997 0 9.467676e+08 SIMPLE_LOG
1998 0 9.467676e+08 SIMPLE_LOG
1999 0 9.467676e+08 SIMPLE_LOG

2000 rows × 3 columns


In [31]:
r_dummyHB = [
    pd.DataFrame(
        [(1, timestamp(i[0]), 'HB{}'.format(index)) for i in r],
        columns=('dummy', 'timestamp', 'source'),
        )
    for index, r in enumerate(results_HB)
    ]
r_dummyHB[0]


Out[31]:
dummy timestamp source
0 1 9.466812e+08 HB0
1 1 9.466812e+08 HB0
2 1 9.466812e+08 HB0
3 1 9.466812e+08 HB0
4 1 9.466812e+08 HB0
5 1 9.466812e+08 HB0
6 1 9.466812e+08 HB0
7 1 9.466812e+08 HB0
8 1 9.466812e+08 HB0
9 1 9.466812e+08 HB0
10 1 9.466812e+08 HB0
11 1 9.466812e+08 HB0
12 1 9.466812e+08 HB0
13 1 9.466812e+08 HB0
14 1 9.466812e+08 HB0
15 1 9.466812e+08 HB0
16 1 9.466812e+08 HB0
17 1 9.466812e+08 HB0
18 1 9.466812e+08 HB0
19 1 9.466812e+08 HB0
20 1 9.466812e+08 HB0
21 1 9.466812e+08 HB0
22 1 9.466812e+08 HB0
23 1 9.466812e+08 HB0
24 1 9.466812e+08 HB0
25 1 9.466812e+08 HB0
26 1 9.466812e+08 HB0
27 1 9.466812e+08 HB0
28 1 9.466812e+08 HB0
29 1 9.466812e+08 HB0
... ... ... ...
258944 1 9.467676e+08 HB0
258945 1 9.467676e+08 HB0
258946 1 9.467676e+08 HB0
258947 1 9.467676e+08 HB0
258948 1 9.467676e+08 HB0
258949 1 9.467676e+08 HB0
258950 1 9.467676e+08 HB0
258951 1 9.467676e+08 HB0
258952 1 9.467676e+08 HB0
258953 1 9.467676e+08 HB0
258954 1 9.467676e+08 HB0
258955 1 9.467676e+08 HB0
258956 1 9.467676e+08 HB0
258957 1 9.467676e+08 HB0
258958 1 9.467676e+08 HB0
258959 1 9.467676e+08 HB0
258960 1 9.467676e+08 HB0
258961 1 9.467676e+08 HB0
258962 1 9.467676e+08 HB0
258963 1 9.467676e+08 HB0
258964 1 9.467676e+08 HB0
258965 1 9.467676e+08 HB0
258966 1 9.467676e+08 HB0
258967 1 9.467676e+08 HB0
258968 1 9.467676e+08 HB0
258969 1 9.467676e+08 HB0
258970 1 9.467676e+08 HB0
258971 1 9.467676e+08 HB0
258972 1 9.467676e+08 HB0
258973 1 9.467676e+08 HB0

258974 rows × 3 columns

Analyzing delays

Plotting


In [32]:
r = r_dummyHB[0]
figure()
plot(r['timestamp'][:100].diff(), 'o-')
show()

figure()
for r in r_dummyHB:
    plot(r['timestamp'][:100].diff(), 'o')
show()



In [33]:
r_mixed = [
    pd.concat((r_real, r))
    for r in r_dummyHB
]
for r in r_mixed:
    r.sort('timestamp', inplace=True)
    r['before'] = r['timestamp'].diff()
    r['after'] = -r['timestamp'].diff(periods=-1)

r_mixed[0].head(10)


Out[33]:
dummy timestamp source before after
0 1 9.466812e+08 HB0 NaN 0.373978
1 1 9.466812e+08 HB0 0.373978 0.604954
2 1 9.466812e+08 HB0 0.604954 0.204110
3 1 9.466812e+08 HB0 0.204110 0.030799
4 1 9.466812e+08 HB0 0.030799 0.265216
5 1 9.466812e+08 HB0 0.265216 0.498351
6 1 9.466812e+08 HB0 0.498351 0.232281
7 1 9.466812e+08 HB0 0.232281 0.674740
8 1 9.466812e+08 HB0 0.674740 0.575126
9 1 9.466812e+08 HB0 0.575126 0.345917

Comparing real and dummy delays


In [34]:
print(r_real['timestamp'][0])
r = r_mixed[0]
plot(r['before'])
r[r['dummy'] == 0].head(10)


946681273.663
Out[34]:
dummy timestamp source before after
0 0 9.466813e+08 SIMPLE_LOG 0.734094 0.002976
1 0 9.466813e+08 SIMPLE_LOG 0.490153 0.255865
2 0 9.466814e+08 SIMPLE_LOG 0.495769 0.431246
3 0 9.466814e+08 SIMPLE_LOG 0.393599 0.022506
4 0 9.466814e+08 SIMPLE_LOG 0.386471 0.145003
5 0 9.466814e+08 SIMPLE_LOG 0.281970 0.087130
6 0 9.466814e+08 SIMPLE_LOG 0.041947 0.293391
7 0 9.466815e+08 SIMPLE_LOG 0.409655 0.137777
8 0 9.466816e+08 SIMPLE_LOG 0.121623 0.552608
9 0 9.466817e+08 SIMPLE_LOG 0.264374 0.579225
<matplotlib.figure.Figure at 0x1168a0a90>

Delay before


In [35]:
# Average delay

if False:
    for r in r_mixed:
        print('all  mean', r['before'].mean())
        print('all  std ', r['before'].std())
        print('dumm mean', r[r['dummy'] == 1]['before'].mean())
        print('dumm std ', r[r['dummy'] == 1]['before'].std())
        print('real mean', r[r['dummy'] == 0]['before'].mean())
        print('real std ', r[r['dummy'] == 0]['before'].std())
        print

bar(
    range(0, len(r_mixed)*3, 3), 
    [r['before'].mean() for r in r_mixed], 
    yerr=[r['before'].std() for r in r_mixed], 
    color='g',
    )
    
bar(
    range(1, len(r_mixed)*3, 3), 
    [r[r['dummy'] == 1]['before'].mean() for r in r_mixed], 
    yerr=[r[r['dummy'] == 1]['before'].std() for r in r_mixed], 
    color='y',
    )

bar(
    range(2, len(r_mixed)*3, 3), 
    [r[r['dummy'] == 0]['before'].mean() for r in r_mixed], 
    yerr=[r[r['dummy'] == 0]['before'].std() for r in r_mixed], 
    color='r',
    )


Out[35]:
<Container object of 5 artists>

Delay After


In [36]:
# Average delay

if False:
    for r in r_mixed:
        print('all  mean', r['after'].mean())
        print('all  std ', r['after'].std())
        print('dumm mean', r[r['dummy'] == 1]['after'].mean())
        print('dumm std ', r[r['dummy'] == 1]['after'].std())
        print('real mean', r[r['dummy'] == 0]['after'].mean())
        print('real std ', r[r['dummy'] == 0]['after'].std())
        print

bar(
    range(0, len(r_mixed)*3, 3), 
    [r['after'].mean() for r in r_mixed], 
    yerr=[r['after'].std() for r in r_mixed], 
    color='g',
    )
    
bar(
    range(1, len(r_mixed)*3, 3), 
    [r[r['dummy'] == 1]['after'].mean() for r in r_mixed], 
    yerr=[r[r['dummy'] == 1]['after'].std() for r in r_mixed], 
    color='y',
    )

bar(
    range(2, len(r_mixed)*3, 3), 
    [r[r['dummy'] == 0]['after'].mean() for r in r_mixed], 
    yerr=[r[r['dummy'] == 0]['after'].std() for r in r_mixed], 
    color='r',
    )


Out[36]:
<Container object of 5 artists>

Delay over time


In [37]:
r = r_mixed[0]
figure()
plot(r['before'][:100], 'o-')
show()

figure()
for r in r_mixed:
    plot(r['before'][:100], 'o')
show()


Delay distribution (before)


In [38]:
distrib_real = r[r['dummy'] == 0]['before'].copy()
distrib_real.sort()

distrib_dummy = r[r['dummy'] == 1]['before'].copy()
distrib_dummy.sort()

figure()
title('dummy messages')
plot(distrib_dummy, '-')
show()

figure()
title('real messages')
plot(distrib_real, '-')

figure()
title('dummy + adapted real messages')
plot(distrib_dummy, '-')
mapped_x_axis = np.array(range(len(distrib_real))) * len(distrib_dummy) / float(len(distrib_real))
plot(mapped_x_axis, distrib_real, '-')

show()


Delay distribution (after)


In [39]:
distrib_real = r[r['dummy'] == 0]['after'].copy()
distrib_real.sort()

distrib_dummy = r[r['dummy'] == 1]['after'].copy()
distrib_dummy.sort()

figure()
title('dummy messages')
plot(distrib_dummy, '-')
show()

figure()
title('real messages')
plot(distrib_real, '-')

figure()
title('dummy + adapted real messages')
plot(distrib_dummy, '-')
mapped_x_axis = np.array(range(len(distrib_real))) * len(distrib_dummy) / float(len(distrib_real))
plot(mapped_x_axis, distrib_real, '-')

show()



In [40]:
distrib_real = pd.DataFrame(r[r['dummy'] == 0]['before'].copy())
distrib_real.sort()
distrib_real['group'] = pd.cut(distrib_real, bins=[2, 4, 6, 8, 10])
#distrib_real.set_index(['group'], inplace=True)
#distrib_real.unstack('group')
distrib_real.groupby('group')
distrib_real


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-40-b96ee4aefcbd> in <module>()
      1 distrib_real = pd.DataFrame(r[r['dummy'] == 0]['before'].copy())
      2 distrib_real.sort()
----> 3 distrib_real['group'] = pd.cut(distrib_real, bins=[2, 4, 6, 8, 10])
      4 #distrib_real.set_index(['group'], inplace=True)
      5 #distrib_real.unstack('group')

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   2005         else:
   2006             # set column
-> 2007             self._set_item(key, value)
   2008 
   2009     def _setitem_slice(self, key, value):

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   2083         is_existing = key in self.columns
   2084         self._ensure_valid_index(value)
-> 2085         value = self._sanitize_column(key, value)
   2086         NDFrame._set_item(self, key, value)
   2087 

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value)
   2146                     value = com._possibly_convert_platform(value)
   2147                 else:
-> 2148                     value = com._asarray_tuplesafe(value)
   2149             elif isinstance(value, PeriodIndex):
   2150                 value = value.asobject

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/common.py in _asarray_tuplesafe(values, dtype)
   2175         return lib.list_to_object_array(values)
   2176 
-> 2177     result = np.asarray(values, dtype=dtype)
   2178 
   2179     if issubclass(result.dtype.type, compat.string_types):

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    458 
    459     """
--> 460     return array(a, dtype, copy=False, order=order)
    461 
    462 def asanyarray(a, dtype=None, order=None):

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/categorical.py in __array__(self, dtype)
    126 
    127     def __array__(self, dtype=None):
--> 128         return com.take_1d(self.levels.values, self.labels)
    129 
    130     def __len__(self):

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/core/common.py in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
    705                                  axis=axis, mask_info=mask_info)
    706 
--> 707     func(arr, indexer, out, fill_value)
    708 
    709     if flip_order:

/Users/okso/.virtualenvs/three/lib/python3.4/site-packages/pandas/algos.so in pandas.algos.take_1d_object_object (pandas/algos.c:77004)()

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)

In [ ]: