In [ ]:
# Helper functions
import re

# Maps an auditd record type (the value of the `type=` field) to the compact
# integer code used downstream as an item/template id. Ordered by code so the
# inverse mapping is easy to eyeball; codes are dense in [0, 24].
type_lookup_table = {
    u'ANOM_ABEND': 0,
    u'SYSTEM_RUNLEVEL': 1,
    u'USER_LOGIN': 2,
    u'USER_CMD': 3,
    u'ADD_GROUP': 4,
    u'SYSCALL': 5,
    u'CRYPTO_KEY_USER': 6,
    u'DAEMON_START': 7,
    u'DAEMON_END': 8,
    u'USER_ACCT': 9,
    u'USER_AUTH': 10,
    u'USER_ERR': 11,
    u'ADD_USER': 12,
    u'CRED_DISP': 13,
    u'CRYPTO_SESSION': 14,
    u'USER_LOGOUT': 15,
    u'USER_START': 16,
    u'CRED_REFR': 17,
    u'SYSTEM_SHUTDOWN': 18,
    u'LOGIN': 19,
    u'CRED_ACQ': 20,
    u'USER_CHAUTHTOK': 21,
    u'NETFILTER_CFG': 22,
    u'USER_END': 23,
    u'CONFIG_CHANGE': 24,
}
def get_data(line, window_size=10, start_time=1422496861):
    """Parse one auditd log line into a (window, type_code) pair.

    line        -- raw audit log line containing `audit(<ts>` and `type=<T> `
    window_size -- width of a time bucket in seconds
    start_time  -- epoch origin the windows are counted from

    Raises AttributeError if either pattern fails to match, and KeyError if
    the record type is not in type_lookup_table (same as the original).
    """
    # Raw strings, and the dot escaped: the original pattern's bare '.' matched
    # any character, so a timestamp without a literal '.' could still "match"
    # and yield the wrong digits.
    timestamp = float(re.search(r'audit\(([0-9]+\.[0-9]+)', line).group(1))
    type_code = type_lookup_table[re.search(r'type=([A-Z_]+) ', line).group(1)]
    window = int((timestamp - start_time) / window_size)
    return (window, type_code)
from collections import defaultdict
def get_longest_sets_possible(input_sets):
    """Return the maximal sets of `input_sets`.

    A set is kept only if it is not a subset of some longer set that is
    already kept; all sets of the greatest length are kept unconditionally.

    input_sets -- iterable of hashable sets (e.g. frozensets)
    Returns a set of the maximal elements. Returns an empty set for empty
    input (the original raised IndexError on `lengths[0]`).
    """
    by_length = defaultdict(set)
    for s in input_sets:
        by_length[len(s)].add(s)
    if not by_length:
        # Nothing to prune; the original crashed on this case.
        return set()
    lengths = sorted(by_length.keys(), reverse=True)  # largest first
    # Every longest set is trivially maximal.
    output_sets = set(by_length[lengths[0]])
    for length in lengths[1:]:
        for item in by_length[length]:
            # Keep `item` only if no already-kept (longer) set contains it.
            # any() short-circuits, unlike the original flag-setting loop.
            if not any(item.issubset(kept) for kept in output_sets):
                output_sets.add(item)
    return output_sets
In [ ]:
# Load Data
# NOTE(review): alternative ingest path for the raw auditd logs, kept for
# reference; the live pipeline below uses the preprocessed tbird data instead.
#logs = sc.textFile("hdfs:///user/ytesfaye/lab41_logs_small.log.gz").repartition(10)
#transactions = logs.map(get_data) \
#    .groupByKey() \
#    .map(lambda (key, iterator): list(set(iterator)))
In [ ]:
# Load the preprocessed / string-matched tbird log lines as an RDD of strings.
tbird_logs = sc.textFile("hdfs://namenode/magichour/tbird.log.preProc.stringmatch").repartition(10)
In [ ]:
# Action: total number of log lines (forces evaluation of the RDD).
tbird_logs.count()
In [ ]:
# Peek at two raw lines to sanity-check the expected "<ts>,<template_id>" format.
tbird_logs.take(2)
In [ ]:
# Load Data
#tbird_logs = sc.textFile("hdfs://namenode/user/ytesfaye/tbird.log.out.logCluster.processed.gz").repartition(10)
# Re-load the same preprocessed tbird dataset (same path as the cell above),
# so this cell can be run standalone.
tbird_logs = sc.textFile("hdfs://namenode/magichour/tbird.log.preProc.stringmatch").repartition(10)
#transactionsNoFreqs = sc.textFile("hdfs://namenode/magichour/tbird.log.out.logCluster.processed.10sec.50000freqnew")
def get_tbird_data(line, window_size=10):
    """Parse one preprocessed tbird line into a (window, type_code) pair.

    line        -- CSV-ish line "<timestamp>,<template_id>[,...]"; only the
                   first two comma-separated fields are used
    window_size -- width of a time bucket in seconds

    Returns (window, type_code) where window = floor(timestamp / window_size).
    Raises ValueError/IndexError on malformed lines (same as the original).
    """
    # (Dead commented-out space-split experiment removed from the body.)
    fields = line.split(',')
    timestamp = float(fields[0])
    type_code = int(fields[1])
    # Bucket the event into a fixed-width time window anchored at epoch 0.
    window = int(timestamp / window_size)
    return (window, type_code)
# Build transactions: one record per time window holding the distinct template
# ids seen in that window (id -1, if present, is dropped as a "no match"
# sentinel — TODO confirm against the preprocessor); keep only windows with at
# least two distinct templates.
# NOTE: `lambda (key, iterator): ...` is Python-2-only tuple unpacking.
transactions = tbird_logs.map(get_tbird_data) \
    .groupByKey() \
    .map(lambda (key, iterator): list(set([item for item in iterator if item != -1]))) \
    .filter(lambda x: len(x) >= 2)
#transactions = transactionsNoFreqs.map(get_tbird_data)
# Load lookup table so that we can get back to raw strings
# template_lookup: template id -> template text. split(',', 2) yields at most
# three parts, so ls[1] is only the second CSV field — any text after a second
# comma is silently dropped (presumably intended; verify against tmp.txt format).
template_lookup = {}
for line in sc.textFile("hdfs://namenode/user/ytesfaye/tmp.txt").collect():
    ls = line.split(',', 2)
    template_lookup[int(ls[0])] = ls[1]
# Vector dimensionality: one slot per template id; assumes ids are dense from 0.
dimension = max(template_lookup.keys()) + 1
In [ ]:
from pyspark.mllib.linalg import Vectors, SparseVector
import numpy as np
from scipy.sparse import lil_matrix
def make_vector(input_list, dimension=dimension):
    """Turn a (transaction, unique_id) pair from zipWithUniqueId() into
    [unique_id, SparseVector] with weight 1.0 at each template index.

    input_list -- (list_of_template_ids, unique_id) tuple
    dimension  -- length of the sparse vector (defaults to the module-level
                  `dimension` captured at definition time)
    """
    transaction, uid = input_list
    indices = sorted(transaction)
    weights = np.ones(len(indices))
    return [uid, SparseVector(dimension, indices, weights)]
In [ ]:
# Turn each transaction into (id, SparseVector). The len >= 2 filter is
# redundant with the one applied when `transactions` was built, but harmless.
vectorized_transactions = transactions.filter(lambda x: len(x) >= 2).zipWithUniqueId().map(make_vector)
In [ ]:
from pyspark.mllib.clustering import LDA
# Fit a 5-topic LDA model over the window/template vectors (fixed seed for
# reproducibility).
model = LDA.train(vectorized_transactions, k=5, seed=1)
In [ ]:
# Per-topic term-weight matrix (terms x topics).
topics = model.topicsMatrix()
In [ ]:
# NOTE(review): max_topics is unused in this cell.
max_topics = 20
num_words_per_topic = 5
# For each topic, print its top templates with their weight (as a percentage)
# and the raw template string (Python 2 print statements).
for topic_num, (ids, weights) in enumerate(model.describeTopics(num_words_per_topic)):
    print 'Topic %d'%topic_num
    print '---------------------'
    for i, n in enumerate(ids):
        print '%4d (%2.2f): %s'%(n, weights[i]*100.0, template_lookup[n])
In [ ]:
from pyspark.mllib.fpm import FPGrowth
# Mine frequent itemsets over the window transactions (min support 1%).
# NOTE(review): rebinding `model` here clobbers the LDA model from above.
model = FPGrowth.train(transactions, minSupport=0.01, numPartitions=10)
result = model.freqItemsets().collect()
In [ ]:
# Keep only maximal frequent itemsets (drop any set contained in a longer one).
items = [frozenset(fi.items) for fi in result]
pruned_items = list(get_longest_sets_possible(items))
In [ ]:
# Number of maximal frequent itemsets found.
len(pruned_items)
In [ ]:
# Print each maximal itemset as space-padded, numerically sorted template ids.
for item in pruned_items:
    print ''.join([' ' + str(i) + ' ' for i in sorted(item, key=int)])
# NOTE(review): dead alternative rendering kept for reference.
#for i in range(len(pruned_items)):
#print '---------------------'
#for template in pruned_items[i]:
#print '%4d'%template, template_lookup[template]
In [ ]:
# Same pruning recomputed, printed '|'-delimited with ','-wrapped ids —
# presumably the input format for a downstream matcher; verify consumer.
items = [frozenset(fi.items) for fi in result]
pruned_items = list(get_longest_sets_possible(items))
for item in pruned_items:
    print '|'.join([',' + str(i) + ',' for i in sorted(item, key=int)])
In [ ]: