In [1]:
bson_file = "/tmp/data/twitter_sample.bson.gz"
json_file = "/tmp/data/twitter_sample.json.gz"

In [2]:
import gzip
import re

In [3]:
from bsonstream import KeyValueBSONInput
import bsonsearch
import bson
def test_bson():
    bc = bsonsearch.bsoncompare()
    bson_fh = gzip.open(bson_file, "rb")
    # Stream raw BSON documents without decoding them up front.
    stream = KeyValueBSONInput(fh=bson_fh, decode=False)
    query = {"text": re.compile(r".*you.*", re.IGNORECASE)}
    matcher = bc.generate_matcher(query)
    for doc in stream:
        if bc.match(matcher, doc):
            # Only documents that match the query are fully decoded into dicts.
            yield bson.BSON(doc).decode()
    bson_fh.close()

In [4]:
import ujson  # ujson (UltraJSON) is the fastest JSON decoder I've seen.
def test_json():
    json_fh = gzip.open(json_file, "rb")
    query = re.compile(r".*you.*", re.IGNORECASE)
    for line in json_fh:
        # Every line has to be fully deserialized before the check can run.
        decoded_json = ujson.decode(line)
        try:
            if query.match(decoded_json['text']):
                yield decoded_json
        except KeyError:  # the "text" key is not guaranteed to be present
            pass
    json_fh.close()

In [5]:
%%timeit
result_bson = [x for x in test_bson()]


100 loops, best of 3: 5.56 ms per loop

In [6]:
%%timeit
result_json = [x for x in test_json()]


100 loops, best of 3: 11.3 ms per loop

It's not surprising that BSON outperformed JSON here.

The BSON path only fully decodes a document into a dict when it matches the query.

The JSON path has to deserialize every document before it can even perform the check.

UTF-8 string decoding/encoding is notoriously slow and should be avoided where possible, especially for something as text-rich as tweets.
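
As a rough sketch (not part of the original benchmark), the deserialization cost on the JSON side can be isolated by timing a decode-only pass against a read-only pass over the same gzipped file; the gap between the two is the per-document overhead the BSON path skips for non-matching documents. This reuses `json_file`, `gzip`, and `ujson` from above, and the numbers will depend on your data and hardware.

def decode_only():
    # Deserialize every line, mirroring what test_json() must do
    # before it can even look at the "text" field.
    with gzip.open(json_file, "rb") as fh:
        for line in fh:
            ujson.decode(line)

def read_only():
    # Baseline: just stream the gzipped lines with no JSON decoding.
    with gzip.open(json_file, "rb") as fh:
        for line in fh:
            pass

# In IPython:
#   %timeit decode_only()
#   %timeit read_only()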
