This notebook contains scripts for testing the CountMinSketch implementation.
It uses a prefix of a preprocessed wiki corpus (title_tokens) as its source, processing at most a fixed amount of text. It first counts word frequencies exactly using a Python Counter (dictionary), then repeats the counting using the Count-min sketch implementation. Basic accuracy statistics, such as the deviation of the estimates from the exact counts, are then reported.
In [15]:
import bounter
import math
import smart_open
import numpy
# some basic declarations about input
# process at most this number of articles
max_articles = 100
# process at most this number of total words
max_words = 150000000
# absolute path to the corpus
wiki_file = 'C:/rare/corpus/wiki/title_tokens.txt.gz'
In [16]:
from bounter import CountMinSketch as CMS
# allocate a 16 MB sketch using the non-conservative (basic) update algorithm
cms = CMS(16, algorithm="basic")
print(cms.width, cms.depth)
print(cms.size() // (1024*1024))  # allocated size in MB
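As a quick smoke test of the sketch API used throughout this notebook (a minimal check; the 'hello' key is just an illustrative value):
In [ ]:
# Count-min sketch never undercounts: after two increments the estimate
# is at least 2, and can exceed 2 only through hash collisions.
cms.increment('hello')
cms.increment('hello')
print(cms['hello'])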
In [17]:
# Loads word counts from the wiki file.
# Either cnt (Python Counter) or cms (count-min sketch) may be None;
# words are only added to the counters that are provided.
# Returns the total number of loaded words.
def load(cnt, cms):
    wiki_input = smart_open.smart_open(wiki_file)
    wiki_input.seek(0)
    articles = 0
    words = 0
    for lineno, line in enumerate(wiki_input):
        articles += 1
        for word in line.decode().split('\t')[1].split():
            if cnt is not None:
                cnt[word] += 1
            if cms is not None:
                cms.increment(word)
            words += 1
        if (articles >= max_articles or words >= max_words):
            break
    return words
In [18]:
# Using cnt as a counter with exact frequencies and cms as count-min sketch estimates,
# computes basic statistics about the accuracy of the estimates:
# deviation: root-mean-square deviation of the estimates from the exact counts
# log_deviation: the same deviation computed on the raw log-counter values
#                instead of the decoded counts
# helper sketch used only for its log_encode method
mks1024 = bounter.CountMinSketch(width=1, depth=1, algorithm='logcounter1024')

def stats(cnt, cms):
    variance = 0
    logvariance = 0
    total_estimated_freq = 0
    total_real_freq = 0
    total_keys = 0
    mds = 0   # largest squared error seen so far
    md = 0    # error with the largest magnitude
    mdw = ''  # word with the largest error
    mdc = 0   # estimated count of that word
    for word, real_count in cnt.items():
        total_keys += 1
        estimated_count = cms[word]
        d = estimated_count - real_count
        logd = mks1024.log_encode(estimated_count) - mks1024.log_encode(real_count)
        total_estimated_freq += estimated_count
        total_real_freq += real_count
        ds = d * d
        logds = logd * logd
        if ds > mds:
            md = d
            mds = ds
            mdw = word
            mdc = estimated_count
        variance += ds
        logvariance += logds
    deviation = math.sqrt(variance / total_keys)
    log_deviation = math.sqrt(logvariance / total_keys)
    ## Uncomment the following lines for more detailed stats
    #print("Total keys: %d" % total_keys)
    #print("Total frequency reported %d (~ %f), expected %d (~ %f)"
    #      % (total_estimated_freq, total_estimated_freq / total_keys, total_real_freq, total_real_freq / total_keys))
    #print("Deviation: %f" % deviation)
    #print("Max error: %d on key %s (expected %d, got %d)" % (md, mdw, cnt[mdw], mdc))
    return total_keys, deviation, log_deviation
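As a toy illustration of what stats returns (hypothetical data; with only three keys in a sketch of their own, the estimates should be exact and both deviations should come out as 0):
In [ ]:
from collections import Counter

toy_cnt = Counter({'a': 100, 'b': 10, 'c': 1})
toy_cms = bounter.CountMinSketch(1)  # 1 MB is far more than three keys need
for word, count in toy_cnt.items():
    for _ in range(count):
        toy_cms.increment(word)
print(stats(toy_cnt, toy_cms))  # expect (3, 0.0, 0.0)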
In [22]:
# From https://stackoverflow.com/a/38515297
import sys

def get_size(obj, seen=None):
    """Recursively finds size of objects"""
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important: mark as seen *before* entering recursion to gracefully handle
    # self-referential objects
    seen.add(obj_id)
    if isinstance(obj, dict):
        size += sum([get_size(v, seen) for v in obj.values()])
        size += sum([get_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([get_size(i, seen) for i in obj])
    return size
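For example (a throwaway check; exact byte counts vary by Python version and platform), get_size on a small Counter accounts for the dict as well as its keys and values:
In [ ]:
from collections import Counter
print(get_size(Counter({'word': 3})))  # a few hundred bytes, platform-dependent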
In [23]:
from collections import Counter

# Tests a single run against a Counter that has already been loaded.
def test(cnt, size, algorithm='conservative'):
    cms = bounter.CountMinSketch(size, algorithm=algorithm)
    load(None, cms)
    (total_keys, deviation, log_deviation) = stats(cnt, cms)
    width = cms.width
    depth = cms.depth
    cms_size = 8 * width * depth
    cnt_size = get_size(cnt)
    result = (width, depth, algorithm, deviation, log_deviation, cms.cardinality(), cms.sum)
    ## Uncomment for more test statistics
    #print("%d\t%d\t%s\t%f\t%f\t%d\t%d" % result)
    #print("Tested CMS with width %d (%f of words) and depth %d" % (width, width / total_keys, depth))
    #print("Deviation: %f" % deviation)
    #print("CMS size: %d bytes, compared to %d Counter size (%f)" % (cms_size, cnt_size, cms_size / cnt_size))
    return result
The following section can be used to ad-hoc test the count-min sketch implementation with hand-picked parameter values.
In [24]:
%%time
max_articles = 10
max_words = 150000000
cnt = Counter()
words = load(cnt, None)
print("Size of orig: %d entries in %d bytes from %d total words" % (len(cnt), get_size(cnt), words))
# modify parameters as you wish
result = test(cnt, 1, 'basic')
print("width: %d\ndepth: %d\nalgorithm: %s\ndeviation: %f\nLog deviation: %f\nCardinality: %d\nTotal: %d" % result)
In [37]:
%%time
max_articles = 50
max_words = 150000000
cnt = Counter()
words = load(cnt, None)
results = []
print("Size of orig: %d entries in %d bytes from %d total words" % (len(cnt), get_size(cnt), words))
for size in [1, 2]:  # all different sizes
    for algorithm in ['basic', 'conservative', 'logcons1024']:
        result = test(cnt, size, algorithm)
        results.append(result)
In [38]:
# columns: width, depth, algorithm, deviation, log deviation, cardinality, total words
for result in results:
    print("%d\t%d\t%s\t%f\t%f\t%d\t%d" % result)
This can be used to manually sanity-test a logcounter implementation. The estimate should be of the same order of magnitude as the exact count.
In [15]:
expected = 1000000
mks = bounter.CountMinSketch(width=1, depth=1, algorithm='logcons1024')
for i in range(expected):
    mks.increment(1)
actual = mks[1]
print("Got %d (%f of original %d)" % (actual, actual / expected, expected))
log_actual = mks.log_encode(actual)
log_expected = mks.log_encode(expected)
print("Counter difference: %d (expected %d, actual %d)" % (log_actual - log_expected, log_expected, log_actual))