In [1]:
import uuid 
import random
import hyperloglog
import pickle
from cassandra.cluster import Cluster
import unittest
import redis

In [15]:
Redis = redis.StrictRedis(host='localhost', port = 6379, db = 0)

In [19]:
Redis.set('foo', ['bar1', 'bar2'])
t = Redis.get('foo')
type(t)


Out[19]:
str

In [47]:
class RedisFramework: 
    def __init__(self): 
        self.user_group = {}
        self.uR = redis.StrictRedis(host = 'localhost', port = 6379, db = 0 )
        self.hllR = redis.StrictRedis(host = 'localhost', port = 6379, db = 0 )
        ### initialize user_group
        for i in range(100):
            self.user_group[i] = []

        self.group_hll = {}
        self.actual_hll = {}
        self.thresholds = [float(random.randint(20, 80))/100 for i in range(100)]
        
    def INSERT(self):
        for j in range(100000):
            uid = uuid.uuid4()
            r = random.random()
            for i in range(100):
                if r > self.thresholds[i]: 
                    self.user_group[int(i)].append(str(uid))
        

        for key in self.user_group: 
            self.uR.set(key, self.user_group[key])
            hll = hyperloglog.HyperLogLog(0.01)
            users = self.user_group[key]
            self.actual_hll[key] = len(users)
            for user in users:
                hll.add(str(user))
            pickled = pickle.dumps(hll)
            self.hllR.set(key, pickled)
    
    def GETHLLCARDINALITY(self, groupID):
        pickled = self.hllR.get(groupID)
        return len(pickle.loads(pickled))
    
    def GETTRUECARDINALITY(self, groupID):
        return self.actual_hll[groupID]

Get thresholds for groups (between 20 and 80 %)


In [48]:
class testCardinalityErrorRates(unittest.TestCase):
    def lessThan10Error(self): 
        for error in x: 
            self.assertTrue(error < .1)

In [49]:
rtest = RedisFramework()

In [50]:
rtest.INSERT()

In [51]:
rtest.GETHLLCARDINALITY(10)


Out[51]:
33285

In [52]:
rtest.GETTRUECARDINALITY(10)


Out[52]:
33043

CASSIE


In [2]:
cluster = Cluster()
metadata= cluster.metadata
session= cluster.connect()

In [4]:
keyname = "newkeyspace"
session.execute("CREATE KEYSPACE IF NOT EXISTS "+keyname +
                " WITH replication = {'class':'SimpleStrategy', 'replication_factor':1};")

In [36]:
session.set_keyspace(keyname)
mytable = "test"
#session.execute("CREATE TABLE users (id int PRIMARY KEY, location address)")
session.execute(" CREATE TABLE  " + mytable+
                " (groupID int PRIMARY KEY, userGroups set<uuid>, hll text);")

In [39]:
for key in d:
    hll = hyperloglog.HyperLogLog(0.01)
    for item in d[key]:
        hll.add(item)
    pickled = pickle.dumps(hll).encode("hex")
    session.execute("""INSERT INTO test (groupID, userGroups, hll)
    VALUES (%(groupID)s, %(userGroups)s, %(hll)s)
    """, 
    {'groupID':key, 'userGroups':d[key], 'hll':pickled }
    )


---------------------------------------------------------------------------
SyntaxException                           Traceback (most recent call last)
<ipython-input-39-b38cb3d59aa5> in <module>()
      7     VALUES (%(groupID)s, %(userGroups)s, %(hll)s)
      8     """, 
----> 9     {'groupID':key, 'userGroups':d[key], 'hll':pickled }
     10     )

/usr/local/lib/python2.7/dist-packages/cassandra/cluster.pyc in execute(self, query, parameters, timeout, trace)
   1403         future = self.execute_async(query, parameters, trace)
   1404         try:
-> 1405             result = future.result(timeout)
   1406         finally:
   1407             if trace:

/usr/local/lib/python2.7/dist-packages/cassandra/cluster.pyc in result(self, timeout)
   2974                     return PagedResult(self, self._final_result, timeout)
   2975             elif self._final_exception:
-> 2976                 raise self._final_exception
   2977             else:
   2978                 raise OperationTimedOut(errors=self._errors, last_host=self._current_host)

SyntaxException: <ErrorMessage code=2000 [Syntax error in CQL query] message="line 2:64 missing EOF at ','">

In [18]:
# MAKE ALL DATA STRUCTURES 
thresholds = [float(random.randint(20, 80))/100 for i in range(100)]
d = {}
from sets import Set
for j in range(100):
    uid = uuid.uuid4()
    r = random.random()
    for i in range(100):
        if r > thresholds[i]: 
            if i in d:
                d[i].add(uid)
            else:
                d[i]= Set([uid])

In [20]:
results = session.execute("SELECT * FROM " +mytable)
for row in results:
    if row.groupid == 10:
        pick = pickle.loads(row.hll)
len(pick)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-20-c4c794d19b1c> in <module>()
      3     if row.groupid == 10:
      4         pick = pickle.loads(row.hll)
----> 5 len(pick)

NameError: name 'pick' is not defined