Auditd to transactions


In [ ]:
logs = sc.textFile("hdfs:///user/ytesfaye/lab41_logs_small.log.gz").repartition(10)

In [ ]:
logs.count()

In [ ]:
import re
type_lookup_table = {u'ADD_GROUP': 4,
 u'ADD_USER': 12,
 u'ANOM_ABEND': 0,
 u'CONFIG_CHANGE': 24,
 u'CRED_ACQ': 20,
 u'CRED_DISP': 13,
 u'CRED_REFR': 17,
 u'CRYPTO_KEY_USER': 6,
 u'CRYPTO_SESSION': 14,
 u'DAEMON_END': 8,
 u'DAEMON_START': 7,
 u'LOGIN': 19,
 u'NETFILTER_CFG': 22,
 u'SYSCALL': 5,
 u'SYSTEM_RUNLEVEL': 1,
 u'SYSTEM_SHUTDOWN': 18,
 u'USER_ACCT': 9,
 u'USER_AUTH': 10,
 u'USER_CHAUTHTOK': 21,
 u'USER_CMD': 3,
 u'USER_END': 23,
 u'USER_ERR': 11,
 u'USER_LOGIN': 2,
 u'USER_LOGOUT': 15,
 u'USER_START': 16}
def get_data(line, window_size=10, start_time=1422496861):
    # Parse an auditd line into a (time window, event type code) pair
    timestamp = float(re.search(r'audit\(([0-9]+\.[0-9]+)', line).group(1))
    type_code = type_lookup_table[re.search(r'type=([A-Z_]+) ', line).group(1)]
    window = int((timestamp - start_time) / window_size)
    return (window, type_code)
from collections import defaultdict
def get_longest_sets_possible(input_sets):
    # Keep only maximal itemsets: drop any set contained in another
    def is_subsumed(kept_sets, item):
        for kept in kept_sets:
            if item.issubset(kept):
                return True
        return False

    # Bucket the itemsets by length so we can scan from longest to shortest
    input_dict = defaultdict(set)
    for i in input_sets:
        input_dict[len(i)].add(i)

    output_sets = set()
    lengths = sorted(input_dict.keys(), reverse=True)  # Longest first
    for i in input_dict[lengths[0]]:  # The longest sets are maximal by definition
        output_sets.add(i)

    for length in lengths[1:]:
        for item in input_dict[length]:
            if not is_subsumed(output_sets, item):
                output_sets.add(item)
    return output_sets

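A quick check of both helpers on made-up inputs (the audit line and the itemsets below are hypothetical, chosen only to exercise the code):

In [ ]:
# Hypothetical auditd line: a SYSCALL event ~10s after start_time -> window 1
print get_data('type=SYSCALL msg=audit(1422496871.392:52): arch=c000003e')
# Subset pruning: frozenset([1, 2]) is absorbed by frozenset([1, 2, 3])
print get_longest_sets_possible([frozenset([1, 2, 3]), frozenset([1, 2]), frozenset([4])])
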
In [ ]:
# One transaction per time window: the distinct event types seen in it
transactions = logs.map(get_data) \
                   .groupByKey() \
                   .map(lambda (key, iterator): list(set(iterator)))
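
A peek at the result helps confirm the shape of the data; each transaction is the list of distinct event type codes seen in one window:

In [ ]:
transactions.take(3)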

tBird Logs


In [ ]:
tbird_logs = sc.textFile("hdfs:///user/ytesfaye/tbird.log.out.logCluster.processed.gz").repartition(10)
def get_tbird_data(line, window_size=10, start_time=1131523501):
    # Each processed line is "<timestamp>,<cluster id>"
    ls = line.split(',')
    timestamp = float(ls[0])
    type_code = int(ls[1])
    window = int((timestamp - start_time) / window_size)
    return (window, type_code)
# Drop unclustered lines (type code -1), then build one transaction per window
transactions = tbird_logs.map(get_tbird_data) \
                   .filter(lambda (x, y): y != -1) \
                   .groupByKey() \
                   .map(lambda (key, iterator): list(set(iterator)))
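
A spot check of the parser on a hypothetical processed line (the real file's values will differ):

In [ ]:
# "<timestamp>,<cluster id>": 1131523511 is 10s past start_time -> window 1
print get_tbird_data('1131523511.0,7')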

Using MLlib


In [ ]:
from pyspark.mllib.fpm import FPGrowth
# An itemset is frequent if it appears in at least 20% of transactions
model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
result = model.freqItemsets().collect()
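
Before pruning it can help to eyeball the raw output; each FreqItemset carries its member items and its absolute frequency:

In [ ]:
# Ten most frequent itemsets (freq = number of transactions containing the set)
for fi in sorted(result, key=lambda fi: fi.freq, reverse=True)[:10]:
    print fi.freq, sorted(fi.items)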

In [ ]:
# Keep only maximal itemsets, then print each one with its items sorted
items = [frozenset(fi.items) for fi in result]
pruned_items = list(get_longest_sets_possible(items))
for item in pruned_items:
    print '|'.join([',' + str(i) + ',' for i in sorted(item, key=int)])

Manual Implementation


In [ ]:
import itertools
# Count each pair of items co-occurring in a transaction, each individual
# item, and the total number of transactions
combo_counts = transactions.flatMap(lambda l: [((i1, i2), 1) for i1, i2 in itertools.combinations(l, 2)]).countByKey()
individual_counts = transactions.flatMap(lambda l: l).countByValue()
num_transactions = transactions.count()
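
The counting logic can be sanity-checked on a toy RDD of hand-made transactions:

In [ ]:
toy = sc.parallelize([[1, 2, 3], [1, 2], [2, 3]])
# Pair counts, e.g. (1, 2) -> 2; item counts, e.g. 2 -> 3
print toy.flatMap(lambda l: [((i1, i2), 1) for i1, i2 in itertools.combinations(l, 2)]).countByKey()
print toy.flatMap(lambda l: l).countByValue()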

In [ ]:
min_support = .2
min_confidence = .95

In [ ]:
# Support / confidence / lift helpers over the precomputed counts
def get_support(item):
    return individual_counts[item] / float(num_transactions)
def get_pair_support(item1, item2):
    # combinations() emits pairs in arbitrary order, so sum both orderings;
    # countByKey gives a defaultdict(int), so missing orderings count as 0
    return (combo_counts[(item1, item2)] + combo_counts[(item2, item1)]) / float(num_transactions)
def get_confidence(item1, item2):
    return get_pair_support(item1, item2) / get_support(item1)
def get_lift(item1, item2):
    return get_pair_support(item1, item2) / (get_support(item1) * get_support(item2))

In [ ]:
# Find pairs exceeding the minimum support and confidence thresholds
pairs = []
# Copy to a plain dict: get_pair_support inserts missing keys into the
# defaultdict, which would otherwise change its size during iteration
combo_counts_dict = dict(combo_counts)
for i1, i2 in combo_counts_dict:
    support = get_pair_support(i1, i2)
    confidence = get_confidence(i1, i2)
    if support > min_support and confidence > min_confidence:
        pairs.append((i1, i2))

In [ ]:
pairs
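
With the helpers above, lift is a useful final check; values well above 1 mean a pair co-occurs far more often than two independent event types would:

In [ ]:
for i1, i2 in pairs:
    print i1, i2, get_lift(i1, i2)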
