In [1]:
bson_file = "/tmp/data/twitter_sample.bson.gz"
json_file = "/tmp/data/twitter_sample.json.gz"
In [2]:
import gzip
import re
In [3]:
from bsonstream import KeyValueBSONInput
import bsonsearch
import bson
def test_bson():
    bc = bsonsearch.bsoncompare()
    bson_fh = gzip.open(bson_file, "rb")
    # decode=False keeps each document as raw BSON bytes
    stream = KeyValueBSONInput(fh=bson_fh, decode=False)
    query = {"text": re.compile(r".*you.*", re.IGNORECASE)}
    matcher = bc.generate_matcher(query)
    for doc in stream:
        if bc.match(matcher, doc):
            # only documents that match the query are fully decoded
            yield bson.BSON(doc).decode()
    bson_fh.close()
In [4]:
import ujson  # UltraJSON is the fastest JSON decoder I've seen.

def test_json():
    json_fh = gzip.open(json_file, "rb")
    query = re.compile(r".*you.*", re.IGNORECASE)
    for line in json_fh:
        # every line must be fully deserialized before it can be checked
        decoded_json = ujson.decode(line)
        try:
            if query.match(decoded_json['text']):
                yield decoded_json
        except KeyError:  # a "text" key is not guaranteed in every document
            pass
    json_fh.close()
In [5]:
%%timeit
result_bson = [x for x in test_bson()]
In [6]:
%%timeit
result_json = [x for x in test_json()]
It is not surprising that BSON outperformed here.
bsonsearch only fully decodes a document into a dict when it matches the query.
The dict/JSON comparison has to deserialize every document before performing the check.
UTF-8 string decoding/encoding is notoriously slow and should be avoided where possible, especially for something as text-rich as tweets.
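A minimal sketch of that difference, using only the bsoncompare calls shown above (the sample documents here are hypothetical): the matcher runs against the raw encoded bytes, and the full decode into a Python dict is only paid for the documents that match.
In [ ]:
import re
import bson
import bsonsearch

bc = bsonsearch.bsoncompare()
matcher = bc.generate_matcher({"text": re.compile(r".*you.*", re.IGNORECASE)})

# hypothetical sample documents, already serialized to raw BSON bytes
raw_docs = [bson.BSON.encode({"text": "thank you!"}),
            bson.BSON.encode({"text": "good morning"}),
            bson.BSON.encode({"lang": "en"})]  # no "text" field at all

# match() operates on the raw bytes; only matching documents pay the decode cost
matches = [bson.BSON(raw).decode() for raw in raw_docs if bc.match(matcher, raw)]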
In [ ]: