Memory consumption


In [1]:
import collections
import subprocess
import itertools
import os
import time

import madoka
import numpy as np
import redis


# Alphabet used to build the benchmark keys: every 3-character string over
# these 62 symbols becomes one key.
ALPHANUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
# Number of distinct 3-character keys (62 ** 3 == 238328), derived from the
# alphabet so the two can never drift apart.
NUM_ALPHANUM_COMBINATION = len(ALPHANUM) ** 3
# Fixed seed so that "Restart Kernel -> Run All" draws the same counts and the
# memory / error-rate figures can be reproduced.
SEED = 42
# One Zipf-distributed count per key. A dedicated RandomState is used instead
# of the global numpy RNG so seeding here has no side effect on other code.
zipf_array = np.random.RandomState(SEED).zipf(1.5, NUM_ALPHANUM_COMBINATION)

In [2]:
def python_memory_usage():
    """Return a memory figure for the current Python process as an int.

    Runs ``ps up <pid>`` for this process and parses one field out of the
    whitespace-split output. Callers in this notebook report the value as KB.
    """
    ps_report = subprocess.getoutput('ps up {}'.format(os.getpid()))
    fields = ps_report.split()
    # The output is a header line followed by the process row; token 15 is
    # taken positionally.
    # NOTE(review): with the usual 11-column `ps u` header this is the VSZ
    # column of the row - confirm on the target platform.
    return int(fields[15])

def redis_memory_usage():
    """Return a memory figure for the running ``redis-server`` process.

    Scans the output of plain ``ps`` for the first line containing
    ``redis-server``, takes that line's first field as the PID, then parses
    ``ps up <pid>`` the same way ``python_memory_usage`` does.

    Returns:
        int: the parsed value (callers in this notebook report it as KB).

    Raises:
        RuntimeError: if no line of the ``ps`` output mentions
            ``redis-server``. Previously this case surfaced as an opaque
            ``UnboundLocalError`` on ``pid``.
    """
    for line in subprocess.getoutput('ps').splitlines():
        if 'redis-server' in line:
            # First whitespace-separated field of the matching line is the PID.
            pid = line.split()[0]
            break
    else:
        raise RuntimeError(
            "no 'redis-server' process found in `ps` output; "
            "start redis-server before running the benchmark"
        )
    # Token 15 of header + row is taken positionally.
    # NOTE(review): with the usual 11-column `ps u` header this is the VSZ
    # column - confirm on the target platform.
    return int(subprocess.getoutput('ps up %s' % pid).split()[15])

def count(counter):
    """Fill ``counter`` with one Zipf-distributed count per 3-character key.

    Every 3-character string over ``ALPHANUM`` (in ``itertools.product``
    order) is used as a key; the value stored for the i-th key is
    ``int(zipf_array[i])``. The same ``counter`` object is mutated in place
    and returned.
    """
    keys = map(''.join, itertools.product(ALPHANUM, repeat=3))
    for index, key in enumerate(keys):
        counter[key] = int(zipf_array[index])
    return counter


def benchmark(counter, start_mem_usage):
    """Populate ``counter`` and print how much this process's memory grew.

    Args:
        counter: mapping-like object to fill via ``count``.
        start_mem_usage: ``python_memory_usage()`` reading taken by the
            caller before ``counter`` was created.

    Returns:
        The populated counter (the object returned by ``count``).
    """
    filled = count(counter)
    # Measure only after filling, so the difference covers the stored data.
    growth = python_memory_usage() - start_mem_usage
    print('memory consumption is {:,d} KB'.format(growth))
    return filled


def redis_benchmark():
    """Store every benchmark key in Redis and print the server's memory growth.

    Connects with ``redis.Redis()`` using its default settings and calls
    ``flushall()`` before measuring, so existing data on that server is
    discarded. All ``SET`` commands are queued on a single pipeline and sent
    with one ``execute()``. The growth printed is the difference between two
    ``redis_memory_usage()`` readings taken around the writes.
    """
    client = redis.Redis()
    client.flushall()
    baseline = redis_memory_usage()
    keys = map(''.join, itertools.product(ALPHANUM, repeat=3))
    with client.pipeline() as pipe:
        for index, key in enumerate(keys):
            pipe.set(key, int(zipf_array[index]))
        pipe.execute()
    growth = redis_memory_usage() - baseline
    print('memory consumption is {:,d} KB'.format(growth))

In [3]:
# In-process containers: take the memory baseline before the container is
# created, fill it, report memory growth and CPU time, then drop the container
# so it does not inflate the next baseline.
for label, make_container in (
    ('collections.Counter', collections.Counter),
    ('madoka.Sketch', madoka.Sketch),
):
    print(label)
    start_mem_usage = python_memory_usage()
    start_time = time.process_time()
    container = make_container()
    benchmark(container, start_mem_usage)
    elapsed = time.process_time() - start_time
    print('Processsing Time is %5f sec.' % elapsed)
    del container
    print('*' * 30)

# Redis: memory is measured on the server process inside redis_benchmark().
# time.process_time() counts CPU time of this Python process only.
print('Redis')
start_time = time.process_time()
redis_benchmark()
elapsed = time.process_time() - start_time
print('Processsing Time is %5f sec.' % elapsed)


collections.Counter
memory consumption is 32,784 KB
Processsing Time is 0.196528 sec.
******************************
madoka.Sketch
memory consumption is 13,316 KB
Processsing Time is 0.332484 sec.
******************************
Redis
memory consumption is 17,416 KB
Processsing Time is 4.106088 sec.

Counting error rate


In [4]:
# Store every key's true count in a fresh sketch and record, per key, the
# relative error (in percent) of the value read back from the sketch.
# NOTE(review): each key is read back immediately after it is written, before
# later keys are inserted - confirm this is the intended measurement.
sketch = madoka.Sketch()
diffs = []
keys = map(''.join, itertools.product(ALPHANUM, repeat=3))
for index, key in enumerate(keys):
    expected = int(zipf_array[index])
    sketch[key] = expected
    error = abs(sketch[key] - expected)
    # Exact matches are recorded as the integer 0, as before.
    diffs.append(error / expected * 100 if error > 0 else 0)

In [5]:
# Mean of the per-key percentage errors collected in `diffs`.
mean_error_percent = np.average(diffs)
print(mean_error_percent)


0.09119661403360393

Conclusion

Memory consumption

[Low] madoka.Sketch < Redis < collections.Counter [High]

Counting error rate

About 0.091 % (mean relative error per key, madoka.Sketch vs. the true count)


In [ ]: