In [ ]:
from collections import namedtuple
from collections import defaultdict
import re
# Record type for a transform entry.
TransformLine = namedtuple(
    'TransformLine',
    ('id', 'type', 'NAME', 'transform', 'compiled'),
)

# Record type for a log line.  The `dictionary` field is the one walked by
# the counting loop further down.
# NOTE(review): the filter below reads `.templateId`, which is not a field of
# this LogLine; the unpickled records presumably carry their own field
# layout -- confirm.
LogLine = namedtuple(
    'LogLine',
    ('ts', 'msg', 'processed', 'dictionary', 'supportId'),
)
In [ ]:
# HDFS locations used by this notebook.
# `matchedTemplates` is the path handed to sc.pickleFile() below.
matchedTemplates = 'hdfs://namenode/magichour/matchedTemplates'
# `templates` is not referenced anywhere else in the visible cells.
templates = 'hdfs://namenode/magichour/templates'
In [ ]:
# Load the records stored at the `matchedTemplates` path.
# NOTE(review): `sc` is not defined in this file; presumably it is the
# SparkContext injected by the PySpark notebook kernel -- confirm.
# NOTE(review): pickleFile() unpickles whatever is stored at the path, so the
# HDFS location must be trusted.
mT = sc.pickleFile(matchedTemplates)
In [ ]:
# Template id under investigation.
curious = 1174
# Keep only the records whose templateId (coerced to int) equals `curious`.
logLineCurious = mT.filter(
    lambda record: int(record.templateId) == curious)
In [ ]:
#logLineCurious.count()
In [ ]:
# Fetch up to 10000 of the filtered records for local inspection.
# (Assuming `logLineCurious` is a Spark RDD, take() returns a plain list.)
test = logLineCurious.take(10000)
In [ ]:
# Accumulator for the per-key value counts built by the loop below.
# The `dict` default factory is never exercised by that loop: it checks for
# each key itself and installs a defaultdict(int) explicitly.
crazyDict = defaultdict(dict)
In [ ]:
# Tally, for every key found in the sampled records' `dictionary`, how many
# times each value occurs: crazyDict[key][value] -> occurrence count.
for logLine in test:
    # .items() rather than the Python-2-only .iteritems(): it behaves the
    # same here and also works on Python 3.
    for key, values in logLine.dictionary.items():
        # First time this key is seen: give it its own zero-defaulting counter
        # (the outer defaultdict's `dict` factory would not count from zero).
        if key not in crazyDict:
            crazyDict[key] = defaultdict(int)
        counts = crazyDict[key]
        # assumes each dictionary value is an iterable of hashable items
        # (e.g. a list of strings) -- TODO confirm
        for value in values:
            counts[value] += 1
In [ ]:
# Bare expression: as the last statement of a notebook cell this displays the
# accumulated per-key value counts.
crazyDict
In [ ]:
# Number of records that matched the `curious` template id (the same call is
# commented out in an earlier cell).
logLineCurious.count()
In [ ]: