In [1]:
import binascii
import pickle
import random
import unittest
import uuid

import hyperloglog
import redis
from cassandra.cluster import Cluster
In [15]:
# Notebook-level Redis client used by the scratch cell that follows:
# server on localhost, port 6379, database 0.
Redis = redis.StrictRedis(host='localhost', port=6379, db=0)
In [19]:
# Round-trip a Python list through Redis to see what comes back.
# Redis values are flat strings, so the list is stringified explicitly here:
# redis-py >= 3.0 rejects a raw list with DataError, while older clients
# silently stored str(list). The explicit str() gives the same stored value
# on both.
Redis.set('foo', str(['bar1', 'bar2']))
t = Redis.get('foo')
# The reply is a flat string value, not a list.
type(t)
Out[19]:
In [47]:
class RedisFramework:
    """Simulate users joining groups; keep exact and HyperLogLog counts per group.

    Each group gets a random threshold; a simulated user joins every group
    whose threshold is below the user's random draw. INSERT() stores, for each
    group, the member list and a pickled HyperLogLog in Redis and remembers the
    exact member count in ``actual_hll``.
    """

    # Member lists are stored under this prefix. Both clients talk to the same
    # Redis database, so writing the list under the bare group id (as before)
    # was immediately overwritten by the HLL written under the same key.
    USER_KEY_PREFIX = "users:"

    def __init__(self, n_groups=100):
        """Open the Redis clients and draw one threshold per group.

        n_groups -- number of groups to simulate (default 100).
        """
        self.user_group = {}
        self.uR = redis.StrictRedis(host='localhost', port=6379, db=0)
        self.hllR = redis.StrictRedis(host='localhost', port=6379, db=0)
        # Every group starts with an empty member list.
        for group_id in range(n_groups):
            self.user_group[group_id] = []
        self.group_hll = {}
        self.actual_hll = {}
        # Thresholds are drawn from {0.20, 0.21, ..., 0.80}.
        self.thresholds = [float(random.randint(20, 80)) / 100
                           for _ in range(n_groups)]

    def INSERT(self, n_users=100000):
        """Generate ``n_users`` random users and persist every group to Redis.

        n_users -- number of simulated users (default 100000).

        Calling this again appends to the existing member lists and rewrites
        the stored values from the accumulated lists.
        """
        n_groups = len(self.thresholds)
        for _ in range(n_users):
            uid = str(uuid.uuid4())
            r = random.random()
            # One draw per user: the user joins every group whose threshold
            # lies below it.
            for group_id in range(n_groups):
                if r > self.thresholds[group_id]:
                    self.user_group[group_id].append(uid)

        for key in self.user_group:
            users = self.user_group[key]
            # Redis values must be flat strings (redis-py >= 3.0 raises
            # DataError for a list), so the ids are comma-joined.
            self.uR.set(self.USER_KEY_PREFIX + str(key), ",".join(users))
            self.actual_hll[key] = len(users)
            hll = hyperloglog.HyperLogLog(0.01)
            for user in users:
                hll.add(str(user))
            # The HLL stays under the bare group id, which is where
            # GETHLLCARDINALITY looks it up.
            self.hllR.set(key, pickle.dumps(hll))

    def GETHLLCARDINALITY(self, groupID):
        """Return the HyperLogLog cardinality estimate stored for ``groupID``.

        Raises KeyError when Redis holds nothing under that key.
        """
        pickled = self.hllR.get(groupID)
        if pickled is None:
            raise KeyError(groupID)
        # NOTE(security): unpickling executes whatever the payload says; this
        # is only acceptable while this class is the sole writer of these keys.
        return len(pickle.loads(pickled))

    def GETTRUECARDINALITY(self, groupID):
        """Return the exact member count recorded by INSERT() for ``groupID``.

        Raises KeyError when the group has not been inserted.
        """
        return self.actual_hll[groupID]
Generate a random membership threshold for each group (between 20% and 80%).
In [48]:
class testCardinalityErrorRates(unittest.TestCase):
def lessThan10Error(self):
for error in x:
self.assertTrue(error < .1)
In [49]:
# Build the framework: the constructor opens the Redis clients, creates the
# empty per-group member lists and draws the per-group thresholds.
rtest = RedisFramework()
In [50]:
# Simulate the users, assign them to groups and write each group's data
# (including its pickled HyperLogLog) to Redis. Requires a running Redis server.
rtest.INSERT()
In [51]:
# HyperLogLog estimate for group 10, unpickled from the value stored in Redis.
rtest.GETHLLCARDINALITY(10)
Out[51]:
In [52]:
# Exact member count for group 10, recorded in memory during INSERT();
# compare with the HyperLogLog estimate for the same group.
rtest.GETTRUECARDINALITY(10)
Out[52]:
In [2]:
# Cassandra connection objects shared by the cells below.
# Cluster() is called without arguments, so the driver's default contact
# settings apply.
cluster = Cluster()
metadata = cluster.metadata
session = cluster.connect()
In [4]:
# Keyspace for this experiment; IF NOT EXISTS keeps the cell safe to re-run.
keyname = "newkeyspace"
create_keyspace_cql = (
    "CREATE KEYSPACE IF NOT EXISTS " + keyname +
    " WITH replication = {'class':'SimpleStrategy', 'replication_factor':1};"
)
session.execute(create_keyspace_cql)
In [36]:
# Switch the session to the experiment keyspace and create the results table:
# one row per group holding the member UUIDs and the serialized HyperLogLog.
session.set_keyspace(keyname)
mytable = "test"
# IF NOT EXISTS (as in the keyspace cell) lets this cell be re-run without
# the driver raising because the table is already there.
session.execute("CREATE TABLE IF NOT EXISTS " + mytable +
                " (groupID int PRIMARY KEY, userGroups set<uuid>, hll text);")
In [39]:
# Write one row per group: the member UUID set plus a HyperLogLog built from
# those members.
# The statement targets `mytable` (the table the create/select cells use) and
# is built once; the %(name)s markers are bound by the driver per execute().
insert_cql = (
    "INSERT INTO " + mytable + " (groupID, userGroups, hll) "
    "VALUES (%(groupID)s, %(userGroups)s, %(hll)s)"
)
for key in d:
    hll = hyperloglog.HyperLogLog(0.01)
    for item in d[key]:
        # Feed the textual UUID, as RedisFramework.INSERT does.
        hll.add(str(item))
    # The hll column is text, so the pickle is stored as hex digits.
    # binascii replaces the Python 2-only "hex" string codec and yields the
    # same digits; readers must unhexlify before unpickling.
    pickled = binascii.hexlify(pickle.dumps(hll)).decode("ascii")
    session.execute(
        insert_cql,
        {'groupID': key, 'userGroups': d[key], 'hll': pickled},
    )
In [18]:
# MAKE ALL DATA STRUCTURES
thresholds = [float(random.randint(20, 80))/100 for i in range(100)]
d = {}
from sets import Set
for j in range(100):
uid = uuid.uuid4()
r = random.random()
for i in range(100):
if r > thresholds[i]:
if i in d:
d[i].add(uid)
else:
d[i]= Set([uid])
In [20]:
# Read back the HyperLogLog stored for group 10 and show its estimate.
results = session.execute("SELECT * FROM " + mytable)
pick = None
for row in results:
    # The driver exposes the groupID column in lower case.
    if row.groupid == 10:
        # The hll column holds the pickle as hex text (see the insert cell),
        # so it has to be turned back into bytes before unpickling.
        # NOTE(security): unpickling executes whatever the payload says; only
        # acceptable while this notebook is the sole writer of the table.
        pick = pickle.loads(binascii.unhexlify(row.hll))
        break  # groupID is the primary key, so there is at most one match
# Last expression of the cell so the notebook displays it; None when group 10
# has no row (e.g. nobody joined it).
len(pick) if pick is not None else None