This notebook contains scripts for testing the CountMinSketch implementation

It uses a prefix of a preprocessed wiki corpus (title_tokens) as the source, processing at most a fixed amount of text. It counts word frequencies exactly using a Python Counter (dictionary), then repeats the counting with the Count-min sketch implementation. Basic statistics, such as the standard deviation of the estimates, are then computed.
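
As a minimal illustration of the comparison performed below (a toy sketch using only the bounter calls that appear later in this notebook; the word list is made up):

from collections import Counter
import bounter

toy_words = ['apple', 'banana', 'apple', 'cherry', 'apple', 'banana']
exact = Counter(toy_words)            # exact counts
sketch = bounter.CountMinSketch(1)    # 1 MB sketch with the default algorithm
for word in toy_words:
    sketch.increment(word)
for word, real_count in exact.items():
    # basic/conservative estimates never undershoot the exact count
    print(word, real_count, sketch[word])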


In [15]:
import bounter
import math
import smart_open
import numpy 

# some basic declarations about the input

# process at most this number of articles
max_articles = 100
# process at most this number of total words
max_words = 150000000
# absolute path to the corpus
wiki_file = 'C:/rare/corpus/wiki/title_tokens.txt.gz'

In [16]:
from bounter import CountMinSketch as CMS
cms = CMS(16, algorithm="basic")  # 16 MB total sketch size
print(cms.width, cms.depth)
print(cms.size() // (1024*1024))  # allocated size in MB


1048576 4
16
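
For reference, this output is consistent with 4-byte cells: 16 MB = 16,777,216 bytes, and with depth 4 that gives width = 16,777,216 / (4 × 4) = 1,048,576 (assuming the whole of size() is spent on the cell table).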

In [17]:
# loads the counts from the wiki file
# cnt (Python Counter) and cms (Count-min sketch) may each be None; words are
# counted only into the structures that are not None
# returns the total number of loaded words
def load(cnt, cms):
    wiki_input = smart_open.smart_open(wiki_file)

    articles = 0
    words = 0
    for line in wiki_input:
        articles += 1
        # take the second tab-separated field (the tokens) and split it into words
        for word in line.decode().split('\t')[1].split():
            if cnt is not None:
                cnt[word] += 1
            if cms is not None:
                cms.increment(word)
            words += 1
        if articles >= max_articles or words >= max_words:
            break
    return words

In [18]:
# using cnt as the counter with exact frequencies and cms as the Count-min sketch estimate,
# computes basic statistics about the accuracy of the estimates
# deviation: root-mean-square error of the estimates
# log_deviation: the same error computed on the log-encoded counter values instead of the decoded values

# helper sketch used only for its log_encode method
mks1024 = bounter.CountMinSketch(width=1, depth=1, algorithm='logcounter1024')

def stats(cnt, cms):
    variance = 0
    logvariance = 0
    total_estimated_freq = 0
    total_real_freq = 0
    total_keys = 0
    mds = 0   # largest squared error seen so far
    md = 0    # the corresponding signed error
    mdw = ''  # the word with the largest error
    mdc = 0   # its estimated count
    
    for word, real_count in cnt.items():
        total_keys += 1
        estimated_count = cms[word]
        d = estimated_count - real_count
        logd = mks1024.log_encode(estimated_count) - mks1024.log_encode(real_count) 
        total_estimated_freq += estimated_count
        total_real_freq += real_count
        ds = d*d
        logds = logd * logd
        if ds > mds:
            md = d
            mds = ds
            mdw = word
            mdc = estimated_count
        variance += ds
        logvariance += logds
        
    deviation = math.sqrt(variance / total_keys)
    log_deviation = math.sqrt(logvariance / total_keys)
    
    ## Uncomment the following lines for more detailed stats
    #print("Total keys: %d" % total_keys)
    #print("Total frequency reported %d (~ %f), expected %d (~ %f)" 
    #      % (total_estimated_freq, total_estimated_freq / total_keys, total_real_freq, total_real_freq / total_keys))
    #print("Deviation: %f"% deviation)
    #print("Max error: %d on key %s (expected %d, got %d)"% (md, mdw, cnt[mdw], mdc))
    return total_keys, deviation, log_deviation
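
For reference, the deviation returned above is the root-mean-square error over all keys, deviation = sqrt(sum_w (cms[w] - cnt[w])^2 / total_keys), and log_deviation applies the same formula to the log-encoded values.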

In [22]:
# From https://stackoverflow.com/a/38515297
import sys

def get_size(obj, seen=None):
    """Recursively finds size of objects"""
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important: mark as seen *before* entering recursion to gracefully
    # handle self-referential objects
    seen.add(obj_id)
    if isinstance(obj, dict):
        size += sum([get_size(v, seen) for v in obj.values()])
        size += sum([get_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([get_size(i, seen) for i in obj])
    return size

In [23]:
from collections import Counter

# tests a single run with the exact Counter already loaded
def test(cnt, size, algorithm='conservative'):
    cms = bounter.CountMinSketch(size, algorithm=algorithm)

    load(None, cms)
    (total_keys, deviation, log_deviation) = stats(cnt, cms)

    width = cms.width
    depth = cms.depth
    cms_size = cms.size()  # allocated sketch size in bytes
    cnt_size = get_size(cnt)
    result = (width, depth, algorithm, deviation, log_deviation, cms.cardinality(), cms.sum)
    ## Uncomment for more test statistics
    #print("%d\t%d\t%s\t%f\t%f\t%d\t%d" % result)
    #print("Tested CMS with width %d (%f of words) and depth %d" % (width, width / total_keys, depth))
    #print("Deviation: %f" % deviation)
    #print("CMS size: %d bytes, compared to %d Counter size (%f)" % (cms_size, cnt_size, cms_size / cnt_size))
    return result

The following section can be used to ad-hoc test the Count-min sketch implementation with different parameter values.


In [24]:
%%time
max_articles = 10
max_words = 150000000
cnt = Counter()
words = load(cnt, None)
print("Size of orig: %d entries in %d bytes from %d total words" % (len(cnt), get_size(cnt), words))
# modify parameters as you wish
result = test(cnt, 1, 'basic')

print("width: %d\ndepth: %d\nalgorithm: %s\ndeviation: %f\nLog deviation: %f\nCardinality: %d\nTotal: %d" % result)


Size of orig: 10700 entries in 905672 bytes from 64685 total words
width: 65536
depth: 4
algorithm: basic
deviation: 0.029002
Log deviation: 0.029002
Cardinality: 10654
Total: 64685
Wall time: 3.66 s

In [37]:
%%time
max_articles = 50
max_words = 150000000

cnt = Counter()
words = load(cnt, None)
results = []
print("Size of orig: %d entries in %d bytes from %d total words" % (len(cnt), get_size(cnt), words))
for size in [1, 2]:  # all different sizes (in MB)
    for algorithm in ['basic', 'conservative', 'logcons1024']:
        result = test(cnt, size, algorithm)
        results.append(result)


Size of orig: 22559 entries in 2608750 bytes from 210713 total words
Wall time: 1min 5s

In [38]:
for result in results:
    print("%d\t%d\t%s\t%f\t%f\t%d\t%d" % result)


65536	4	basic	0.123845	0.123845	22741	210713
65536	4	conservative	0.065573	0.065573	22741	210713
131072	4	logcons1024	2.974357	0.450876	22741	210713
131072	4	basic	0.035854	0.035854	22741	210713
131072	4	conservative	0.018832	0.018832	22741	210713
262144	4	logcons1024	1.629538	0.255443	22741	210713

This can be used to manually sanity-test a logcounter implementation. The estimate should be within the same order of magnitude as the exact count.


In [15]:
expected = 1000000
mks = bounter.CountMinSketch(width=1, depth=1, algorithm='logcons1024')
for i in range(expected):
    mks.increment(1)
actual = mks[1]
print("Got %d (%f of original %d)" % (actual, actual / expected, expected))
log_actual = mks.log_encode(actual)
log_expected = mks.log_encode(expected)
print("Counter difference: %d (expected %d, actual %d)" % (log_actual - log_expected, log_expected, log_actual))


Got 997376 (0.997376 of original 1000000)
Counter difference: -5 (expected 11169, actual 11164)
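
For intuition about why the estimate lands close to, but not exactly at, the true count, here is a toy Morris-style probabilistic counter. This illustrates only the general log-counting principle, not bounter's actual logcons1024 scheme:

import random

def morris_count(n, seed=0):
    """Count n events with a Morris counter: store only an exponent c,
    incremented with probability 2**-c; estimate n as 2**c - 1."""
    random.seed(seed)
    c = 0
    for _ in range(n):
        if random.random() < 2 ** -c:
            c += 1
    return c, (2 ** c) - 1

c, estimate = morris_count(1000000)
print("exponent %d, estimate %d (%f of original)" % (c, estimate, estimate / 1000000))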
