In [ ]:
logs = sc.textFile("hdfs:///user/ytesfaye/lab41_logs_small.log.gz").repartition(10)
In [ ]:
logs.count()
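In [ ]:
# Optional sanity check (not part of the original analysis): peek at a few raw
# audit lines to confirm the format that the parsing regexes below expect.
logs.take(3)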
In [ ]:
import re
# Map each audit record type to a small integer code used as an item in the transactions below.
type_lookup_table = {u'ADD_GROUP': 4,
                     u'ADD_USER': 12,
                     u'ANOM_ABEND': 0,
                     u'CONFIG_CHANGE': 24,
                     u'CRED_ACQ': 20,
                     u'CRED_DISP': 13,
                     u'CRED_REFR': 17,
                     u'CRYPTO_KEY_USER': 6,
                     u'CRYPTO_SESSION': 14,
                     u'DAEMON_END': 8,
                     u'DAEMON_START': 7,
                     u'LOGIN': 19,
                     u'NETFILTER_CFG': 22,
                     u'SYSCALL': 5,
                     u'SYSTEM_RUNLEVEL': 1,
                     u'SYSTEM_SHUTDOWN': 18,
                     u'USER_ACCT': 9,
                     u'USER_AUTH': 10,
                     u'USER_CHAUTHTOK': 21,
                     u'USER_CMD': 3,
                     u'USER_END': 23,
                     u'USER_ERR': 11,
                     u'USER_LOGIN': 2,
                     u'USER_LOGOUT': 15,
                     u'USER_START': 16}
def get_data(line, window_size=10, start_time=1422496861):
    """Parse one audit line into (time-window index, event-type code)."""
    timestamp = float(re.search(r'audit\(([0-9]+\.[0-9]+)', line).group(1))
    type_code = type_lookup_table[re.search(r'type=([A-Z_]+) ', line).group(1)]
    window = int((timestamp - start_time) / window_size)
    return (window, type_code)
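
# Worked example (hypothetical audit line, for illustration only; not from the data):
#   get_data('type=SYSCALL msg=audit(1422496905.123:4567): arch=c000003e syscall=59 ...')
#   -> type_code 5 (SYSCALL), window = int((1422496905.123 - 1422496861) / 10) = 4,
#      so the returned pair is (4, 5)
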
from collections import defaultdict
def get_longest_sets_possible(input_sets):
    """Drop any itemset that is a subset of another itemset in the input."""
    def is_subset(main_set, item):
        for main_item in main_set:
            if item.issubset(main_item):
                return True
        return False

    # Bucket the itemsets by length so the longest ones are considered first.
    input_dict = defaultdict(set)
    for i in input_sets:
        input_dict[len(i)].add(i)

    output_sets = set()
    lengths = sorted(input_dict.keys(), reverse=True)  # Largest first
    for i in input_dict[lengths[0]]:  # The longest itemsets are always kept
        output_sets.add(i)
    for length in lengths[1:]:
        for item in input_dict[length]:
            if not is_subset(output_sets, item):
                output_sets.add(item)
    return output_sets
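In [ ]:
# Quick illustration on hand-made input (not real data): itemsets that are subsets
# of a longer itemset already kept are pruned, disjoint itemsets survive.
get_longest_sets_possible([frozenset([1, 2, 3]), frozenset([1, 2]), frozenset([4, 5])])
# Expected: set([frozenset([1, 2, 3]), frozenset([4, 5])])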
In [ ]:
# One transaction per time window: the distinct event-type codes seen in that window.
transactions = logs.map(get_data) \
                   .groupByKey() \
                   .map(lambda (key, iterator): list(set(iterator)))
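In [ ]:
# Optional: inspect one transaction -- the distinct event-type codes observed in a single window.
transactions.take(1)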
In [ ]:
tbird_logs = sc.textFile("hdfs:///user/ytesfaye/tbird.log.out.logCluster.processed.gz").repartition(10)
def get_tbird_data(line, window_size=10, start_time=1131523501):
    """Parse one pre-clustered Thunderbird line ("timestamp,type_code,...") into (window, type_code)."""
    ls = line.split(',')
    timestamp = float(ls[0])
    type_code = int(ls[1])
    window = int((timestamp - start_time) / window_size)
    return (window, type_code)

# Note: this overwrites `transactions` from the audit-log cell above; run one or the other.
transactions = tbird_logs.map(get_tbird_data) \
                         .filter(lambda (x, y): y != -1) \
                         .groupByKey() \
                         .map(lambda (key, iterator): list(set(iterator)))
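In [ ]:
# Optional: `transactions` is reused several times below (FP-Growth, pair counts, count),
# so caching it avoids re-reading and re-parsing the source file on every pass.
transactions.cache()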
In [ ]:
from pyspark.mllib.fpm import FPGrowth
model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
result = model.freqItemsets().collect()
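In [ ]:
# Sketch: inspect the mined itemsets. Each FreqItemset exposes .items and .freq;
# dividing .freq by the number of transactions gives the itemset's support.
n = transactions.count()
for fi in sorted(result, key=lambda fi: -fi.freq)[:10]:
    print fi.items, fi.freq / float(n)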
In [ ]:
items = [frozenset(fi.items) for fi in result]
pruned_items = list(get_longest_sets_possible(items))
for item in pruned_items:
    print '|'.join([',' + str(i) + ',' for i in sorted(item, key=int)])
In [ ]:
import itertools
# Count, across all transactions, how often each ordered pair of event types co-occurs,
# along with per-item counts and the total number of transactions.
combo_counts = transactions.flatMap(lambda l: [((i1, i2), 1) for i1, i2 in itertools.combinations(l, 2)]).countByKey()
individual_counts = transactions.flatMap(lambda l: l).countByValue()
num_transactions = transactions.count()
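In [ ]:
# For reference, the standard association-rule measures that the helper cell below implements:
#   support(A)         = count(A) / num_transactions
#   support(A, B)      = count(A and B together) / num_transactions
#   confidence(A -> B) = support(A, B) / support(A)
#   lift(A, B)         = support(A, B) / (support(A) * support(B))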
In [ ]:
min_support = .2
min_confidence = .95
In [ ]:
In [ ]:
# Find pairs exceeding the minimum support and confidence.
# Note: get_pair_support / get_confidence are defined in the helper cell below; run it first.
pairs = []
combo_counts_dict = dict(combo_counts)  # Plain dict copy, so defaultdict lookups elsewhere don't grow the dict we iterate
for i1, i2 in combo_counts_dict:
    support = get_pair_support(i1, i2)
    confidence = get_confidence(i1, i2)
    if support > min_support and confidence > min_confidence:
        pairs.append((i1, i2))
        #print support, get_confidence(i1, i2), d, i1, i2
In [ ]:
pairs
In [ ]:
def get_support(item):
    """Fraction of transactions containing `item`."""
    return individual_counts[item] / float(num_transactions)

def get_pair_support(item1, item2):
    """Fraction of transactions containing both items (counted in either order)."""
    return (combo_counts[(item1, item2)] + combo_counts[(item2, item1)]) / float(num_transactions)

def get_confidence(item1, item2):
    """Confidence of the rule item1 -> item2."""
    return float(get_pair_support(item1, item2)) / get_support(item1)

def get_lift(item1, item2):
    """Lift of the pair; > 1 means they co-occur more than expected under independence."""
    return float(get_pair_support(item1, item2)) / (get_support(item1) * get_support(item2))
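In [ ]:
# Sketch (assumes the cells above have been run): rank the surviving pairs by lift;
# lift > 1 means the two event types co-occur more often than independence would predict.
for i1, i2 in sorted(pairs, key=lambda p: -get_lift(p[0], p[1])):
    print i1, i2, get_pair_support(i1, i2), get_confidence(i1, i2), get_lift(i1, i2)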
In [ ]: