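A Bloom filter is a space-efficient, probabilistic data structure used to test whether an element x is a member of a set of m elements. A lookup may report a false positive (an element that was never added), but never a false negative.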
http://prakhar.me/articles/bloom-filters-for-dummies/
http://axiak.github.io/pybloomfiltermmap/
Download from: https://pypi.python.org/pypi/pybloom/1.0.2 and then install it by running `pip install ez_setup` followed by `python setup.py install`.
In [3]:
from pybloom import BloomFilter
# Fixed-capacity filter configured for 10,000 elements and an error_rate of 0.001.
f = BloomFilter(capacity=10000, error_rate=0.001)
# Fill the filter with the integers 0 .. capacity - 1.
# `range` is used instead of `xrange` so the cell runs on Python 2 and 3 alike,
# and the result of add() is simply discarded rather than bound to `_`
# (which would shadow IPython's "last output" variable).
for i in range(f.capacity):
    f.add(i)
# 0 was added above, so this membership test is expected to be True.
0 in f
Out[3]:
In [4]:
# Only 0 .. capacity - 1 were added, so f.capacity itself was never inserted;
# a True result here would be a false positive.
f.capacity in f
Out[4]:
In [5]:
# The number of elements the filter reports holding should not exceed its capacity.
len(f) <= f.capacity
Out[5]:
In [6]:
# Relative deviation of the reported element count from the capacity,
# checked against the filter's configured error_rate.
abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
Out[6]:
In [7]:
from pybloom import ScalableBloomFilter
# Scalable filter: no capacity is passed up front, only the growth mode.
sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
count = 10000
# Add the integers 0 .. count - 1.
# `range` is used instead of `xrange` so the cell runs on Python 2 and 3 alike,
# and the result of add() is simply discarded rather than bound to `_`
# (which would shadow IPython's "last output" variable).
for i in range(count):
    sbf.add(i)
# After `count` insertions the reported capacity is expected to exceed `count`.
sbf.capacity > count
Out[7]:
In [8]:
# The scalable filter's reported element count should not exceed the number of items added.
len(sbf) <= count
Out[8]:
In [9]:
# Relative deviation of the reported element count from the true number of
# insertions, checked against the scalable filter's error_rate.
abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
Out[9]:
In [14]:
from pybloom import BloomFilter
import os
import re
POST_DIR = 'data/posts/'

# Read all my posts. Each file is opened in a `with` block so its handle is
# closed as soon as the contents have been read, and directory entries that
# are not regular files (e.g. sub-directories) are skipped instead of
# crashing open().
posts = {}
for post_name in os.listdir(POST_DIR):
    post_path = os.path.join(POST_DIR, post_name)
    if not os.path.isfile(post_path):
        continue
    with open(post_path) as post_file:
        posts[post_name] = post_file.read()

# Create a dictionary of {"post name": "lowercase word set"}.
# The pattern is a raw string so that \W reaches the regex engine unchanged
# (a non-raw "\W" is an invalid string escape on recent Python versions).
split_posts = {name: set(re.split(r"\W+", contents.lower())) for name, contents in posts.items()}
In [15]:
# Build one Bloom filter per post, sized to that post's distinct-word count.
filters = {}
for name, words in split_posts.items():
    bloom = BloomFilter(capacity=len(words), error_rate=0.1)
    for word in words:
        bloom.add(word)
    # Store the filter under the post name once it has been populated.
    filters[name] = bloom
In [20]:
def search(search_string, index=None):
    """Return the names of the posts that appear to contain every search term.

    The query is lower-cased and broken into runs of word characters, which
    matches the lower-casing applied to the posts before they were indexed.
    Searching is therefore case-insensitive, and punctuation or surrounding
    whitespace in the query never produces empty search terms.

    Args:
        search_string: Free-text query, e.g. "Ice cream".
        index: Optional mapping of post name -> container supporting ``in``
            (a Bloom filter, a set, ...). Defaults to the notebook-level
            ``filters`` dictionary.

    Returns:
        A list of the post names whose container reports every term as
        present. When the containers are Bloom filters the list may include
        false positives. A query without any word characters returns an
        empty list.
    """
    if index is None:
        index = filters
    # findall (rather than split) never yields empty strings at the edges.
    search_terms = re.findall(r"\w+", search_string.lower())
    if not search_terms:
        # all() over no terms would be True and match every post.
        return []
    return [name for name, words in index.items()
            if all(term in words for term in search_terms)]
In [24]:
# Single-term query: lists the posts whose filter reports the word "nothing".
search("nothing")
Out[24]:
In [25]:
# Single-term query: lists the posts whose filter reports the word "ice".
search("ice")
Out[25]:
In [ ]:
# %load data\posts\3.txt
Vacation Time
In [ ]:
# %load data\posts\4.txt
Ice cream is always good
In [35]:
# Average size, in bytes, of the bit array behind each post's filter.
filter_sizes = [len(bloom.bitarray.tobytes()) for bloom in filters.values()]
sum(filter_sizes) / len(filters)
Out[35]:
In [ ]: