In [ ]:
from collections import namedtuple
from collections import defaultdict
import re

# Record type describing one transform rule.
# NOTE(review): TransformLine is not used by any other cell visible in this
# notebook.
TransformLine = namedtuple(
    'TransformLine',
    'id type NAME transform compiled',
)

# Record type describing one log line.
# NOTE(review): a later cell reads `line.templateId`, which is not one of the
# fields declared here -- confirm the schema of the pickled records.
LogLine = namedtuple(
    'LogLine',
    'ts msg processed dictionary supportId',
)

In [ ]:
# HDFS URIs of the magichour artifacts this notebook works with.
# NOTE(review): `templates` is not read by any cell visible here.
templates = 'hdfs://namenode/magichour/templates'
# Location passed to sc.pickleFile() in the following cell.
matchedTemplates = 'hdfs://namenode/magichour/matchedTemplates'

In [ ]:
# Load the pickled records stored at `matchedTemplates`.
# `sc` is not defined anywhere in this notebook -- presumably the SparkContext
# injected by the PySpark kernel; confirm before running on a plain kernel.
# NOTE(review): this unpickles whatever lives at that path, so the location
# must be trusted.
mT = sc.pickleFile(matchedTemplates)

In [ ]:
# Template id singled out for a closer look.
curious = 1174

# Keep only the records whose templateId, coerced to int, equals `curious`.
logLineCurious = mT.filter(
    lambda record: int(record.templateId) == curious
)

In [ ]:
# Disabled: logLineCurious.count()  (the same call is executed in a later cell)

In [ ]:
# Fetch at most 10000 of the filtered records for local inspection.
# NOTE(review): 10000 is an unexplained cap; everything tallied from `test`
# below reflects only this sample, not the full filtered set.
test = logLineCurious.take(10000)

In [ ]:
# Tally structure: crazyDict[key][value] -> number of times `value` was seen
# under `key`.
# The inner level is used as an integer counter, so the default factory builds
# a defaultdict(int). With a plain `dict` factory, an unguarded
# `crazyDict[key][value] += 1` raises KeyError.
crazyDict = defaultdict(lambda: defaultdict(int))

In [ ]:
# For every sampled record, count how often each value occurs under each key
# of the record's `dictionary` attribute.
# Presumably `dictionary` maps a key to an iterable of values -- confirm
# against the code that produced the pickled records.
# NOTE: this accumulates into crazyDict, so re-running this cell without first
# re-creating crazyDict counts every record twice.
for record in test:
    # .items() replaces the Python-2-only .iteritems(); it works on both
    # Python 2 and Python 3.
    for key, values in record.dictionary.items():
        # Explicit guard: guarantees the inner container is an int counter
        # regardless of crazyDict's own default factory.
        if key not in crazyDict:
            crazyDict[key] = defaultdict(int)
        for value in values:
            crazyDict[key][value] += 1

In [ ]:
# Display the accumulated key -> value -> count tallies.
crazyDict

In [ ]:
# Number of records in the filtered set for `curious`; unlike `test`, this is
# not limited to the 10000-record sample.
logLineCurious.count()

In [ ]: