Evaluation of the Large-Scale Sibling Scan Using Various Algorithms

Versions and contact details are the same as in siblings_ml.ipynb.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from IPython.display import Image  
import pandas
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline

def get_pd_files(folder):
    """Load the sibling and non-sibling result CSVs found in *folder*.

    *folder* is used as a plain string prefix, so it must end with a path
    separator. The non-sibling file name contains a wildcard; if several
    files match, the lexicographically last one is used.

    Returns a tuple ``(sib, nonsib)`` of DataFrames indexed by their first CSV
    column, with a ``label`` column added (1 for siblings, 0 for
    non-siblings).

    If a file is missing, a diagnostic is printed and the subsequent
    ``pd.read_csv`` call is left to raise.
    """
    import glob
    import os.path

    sibf = folder + "hosts.csvcapture.pcap.ts.siblingresult.csv"
    nonsibf = folder + "hosts.csv__nonsiblings_seed1_n*capture.pcap.ts.siblingresult.csv"
    # Resolve the wildcard; sorting makes the pick deterministic.
    for filename in sorted(glob.glob(nonsibf)):
        nonsibf = filename
    if os.path.isfile(sibf) and os.path.isfile(nonsibf):
        print("Loading from filenames {} and {}".format(sibf, nonsibf))
    else:
        # Only report the problem here; read_csv below raises the real error.
        print("Files not found {} and {}".format(sibf, nonsibf))
    sib = pd.read_csv(sibf, index_col=0)
    sib['label'] = 1
    nonsib = pd.read_csv(nonsibf, index_col=0)
    nonsib['label'] = 0
    print("Read {} siblings and {} non-siblings from files.".format(len(sib), len(nonsib)))
    return sib, nonsib
def dec2prd_ours(df):
    """Translate the textual ``decision`` column into a numeric ``dec_prd``.

    Rows whose decision starts with "sibling" get 1, rows starting with
    "non-sibling" get 0. Every other row (errors, unknowns, missing values)
    is left without a value, i.e. NaN once the column exists.

    *df* is modified in place; nothing is returned.
    """
    # na=False: a missing decision would otherwise produce a NaN entry in the
    # boolean mask, which .loc rejects.
    df.loc[df["decision"].str.contains(r"^sibling", na=False), "dec_prd"] = 1
    df.loc[df["decision"].str.contains(r"^non-sibling", na=False), "dec_prd"] = 0
    return  # df is changed in place so no returning necessary

def dec2prd_bev(df):
    """Translate the textual ``dec_bev`` column into a numeric ``dec_bev_prd``.

    Rows whose ``dec_bev`` starts with "sibling" get 1, rows starting with
    "non-sibling" get 0. Every other row (errors, unknowns, missing values)
    is left without a value, i.e. NaN once the column exists.

    *df* is modified in place; nothing is returned.
    """
    # na=False: a missing value would otherwise produce a NaN entry in the
    # boolean mask, which .loc rejects.
    df.loc[df["dec_bev"].str.contains(r"^sibling", na=False), "dec_bev_prd"] = 1
    df.loc[df["dec_bev"].str.contains(r"^non-sibling", na=False), "dec_bev_prd"] = 0
    return  # df is changed in place so no returning necessary

def mix_sib_nonsib(sib, nonsib, mode, rs=42):
    """Concatenate siblings and non-siblings into one DataFrame.

    mode == "equal": draw ``len(sib)`` non-siblings with replacement (seeded
    with *rs*) so both classes have the same size.
    Any other mode: use all non-siblings unchanged.

    Returns the siblings followed by the selected non-siblings.
    """
    if mode == "equal":
        nonsibint = nonsib.sample(n=len(sib), replace=True, weights=None, random_state=rs)
    else:
        nonsibint = nonsib
    datain = pd.concat([sib, nonsibint])
    return datain

def get_ouralgo_stats(sib, nonsib):
    """Score our algorithm's ``decision`` column against the ground truth.

    All siblings and non-siblings are combined, textual decisions are mapped
    to 1/0 by ``dec2prd_ours``, and rows without a sibling/non-sibling verdict
    are counted as undecided and excluded before scoring.

    Returns whatever ``stats`` returns for the decided rows.
    """
    df = mix_sib_nonsib(sib, nonsib, "full", 42)
    df_ours = df[["label", "decision"]].copy()
    # Creates the "dec_prd" column that is scored below; undecided rows keep
    # NaN there and are therefore caught by the isnull() count.
    dec2prd_ours(df_ours)
    undec = len(df_ours[df_ours.isnull().any(axis=1)])
    print("Our algo: Not deciding on {} pairs for unknown/error reasons.".format(undec))
    df_ours = df_ours.dropna()
    print("Our algo stats: ({}) undecided".format(undec))
    return stats(df_ours["label"], df_ours["dec_prd"])

def get_bev_stats(sib, nonsib):
    """Score the ``dec_bev`` column against the ground truth.

    Rows with a missing ``dec_bev`` are counted and removed first. The
    remaining textual decisions are mapped to 1/0 by ``dec2prd_bev``; rows
    that are neither sibling nor non-sibling are counted as undecided and
    excluded before scoring.

    Returns whatever ``stats`` returns for the decided rows.
    """
    df = mix_sib_nonsib(sib, nonsib, "full", 42)
    df_tmp = df[["label", "dec_bev"]].copy()
    missing = df_tmp["dec_bev"].isnull()
    dec_nan = int(missing.sum())
    df_tmp = df_tmp[~missing].copy()
    # Creates the "dec_bev_prd" column that is scored below.
    dec2prd_bev(df_tmp)
    undec = len(df_tmp[df_tmp.isnull().any(axis=1)])
    df_tmp = df_tmp.dropna()
    print("Beverly algo: Not deciding on {} pairs for NaN and {} pairs for unknown/error reasons.".format(dec_nan, undec))
    print("Beverly algo stats: ({}) undecided".format(undec))
    return stats(df_tmp["label"], df_tmp["dec_bev_prd"])

def match_nonsibs_slow(sib, nonsib, rs=42):
    """Row-by-row reference loop over all ordered pairs of sibling rows.

    For every pair of rows whose entry ``[1]`` differs, the candidate name
    "<first>_+_<second>" is built and counted. The lookup of that candidate in
    *nonsib* is disabled, so the function currently has no effect and
    returns None. *nonsib* and *rs* are unused.
    """
    pair_count = 0
    for _, left in sib.iterrows():
        for _, right in sib.iterrows():
            if left[1] == right[1]:
                continue
            candidate = left[1] + "_+_" + right[1]
            pair_count += 1
            # Disabled check: report candidates missing from nonsib["domain"].
def match_nonsibs(sib, nonsib, rs=42):
    """Keep only the non-siblings that can be built from the given siblings.

    A non-sibling is kept when its index label equals "<a>_+_<b>" for two
    different index labels a, b of *sib*. *nonsib* is not modified; a filtered
    copy is returned. *rs* is unused and kept for interface compatibility.
    """
    sibling_keys = list(dict.fromkeys(sib.index))  # unique, order preserved
    # All ordered pairs of distinct siblings, spelled like the non-sibling index.
    nscand = {
        first + "_+_" + second
        for first in sibling_keys
        for second in sibling_keys
        if first != second
    }
    print("Generated {} non-sibling candidates from {} siblings.".format(len(nscand), len(sib)))
    # Non-siblings that involve a host outside the given sibling set.
    fails = [key for key in dict.fromkeys(nonsib.index) if key not in nscand]
    nsfiltered = nonsib.drop(fails)
    return nsfiltered

def assign_groups_old(datain):
    """Older variant of ``assign_groups`` that reads the ``domain`` column.

    Adds a ``group`` column to *datain* in place:
    "servers" by default, "nlnog" for domains containing "nlnog.net", "RA" for
    domains containing "RA_", refined to "RAv1" (4-digit id below 6019) or
    "RAv2" (id above 6018) when an id follows "RA_". Returns None.
    """
    domain = datain["domain"]
    datain["group"] = "servers"
    # Literal matching: "." must not act as a regex wildcard; missing domains
    # simply do not match.
    datain.loc[domain.str.contains("nlnog.net", regex=False, na=False), "group"] = "nlnog"
    datain.loc[domain.str.contains("RA_", regex=False, na=False), "group"] = "RA"
    # The extracted id is text; convert it before comparing with numbers.
    # Rows without an id become NaN and fail both comparisons.
    ra_id = domain.str.extract("RA_([0-9]{4})", expand=False).astype(float)
    datain.loc[ra_id < 6019, "group"] = "RAv1"
    datain.loc[ra_id > 6018, "group"] = "RAv2"

def assign_groups(datain):
    """Label every row of *datain* with a host group derived from its index.

    Adds two columns to *datain* in place:
      * ``ra_id``: the 4-digit number following "RA_" in the index label,
        or 0 when there is none;
      * ``group``: "servers" by default, "nlnog" for labels containing
        "nlnog.net", "RA" for labels containing "RA_", refined to "RAv1"
        (ids 6000..6018) or "RAv2" (ids above 6018).

    Returns the ``group`` column as a NumPy array.
    """
    datain["group"] = "servers"
    # Literal matching: "." must not act as a regex wildcard.
    datain.loc[datain.index.str.contains("nlnog.net", regex=False), "group"] = "nlnog"
    datain.loc[datain.index.str.contains("RA_", regex=False), "group"] = "RA"
    datain["ra_id"] = datain.index.str.extract("RA_([0-9]{4})", expand=False).astype(float).fillna(0).astype(int)
    datain.loc[(datain.ra_id > 5999) & (datain.ra_id < 6019), "group"] = "RAv1"
    datain.loc[datain.ra_id > 6018, "group"] = "RAv2"
    # to_numpy() replaces as_matrix(), which no longer exists in pandas.
    groups = datain["group"].to_numpy()
    return groups
def prune_datain(datain):
    """Drop error rows and split off the rows with a non-zero ``hzdiff``.

    Returns ``(dataout, lbl, prd)``:
      * dataout: rows whose decision holds no "ERROR"/"error" and whose
        hzdiff equals 0;
      * lbl: the labels of the error-free rows with hzdiff != 0;
      * prd: a same-shaped series of zeros (those rows are decided as
        non-sibling).
    """
    has_error = datain["decision"].str.contains("ERROR|error")
    print("Removing {} errors values from datain.".format(len(datain[has_error == True])))
    # Comparing with False also drops rows whose decision is missing.
    clean = datain[has_error == False]

    hz_differs = clean["hzdiff"] != 0
    print("Deciding {} hzdiff hosts as non-sib, stats:".format(len(clean[hz_differs])))
    lbl = clean[hz_differs]["label"]
    prd = lbl.copy()
    prd[:] = 0
    return clean[clean["hzdiff"] == 0], lbl, prd

def prune_data_for_ml(datain):
    """Remove rows that cannot be fed to the ML estimator.

    A row is removed when its decision contains "ERROR"/"error" or when any
    of its features (as built by ``make_labels_features``) is NaN.

    Returns ``(dataout, lbl, prd)``: the remaining rows, the labels of the
    removed rows, and a same-shaped series of zeros for the removed rows.
    """
    # problem: NaNs might be in non-feature columns such as RA_ID, so only the
    # feature matrix is inspected for missing values.
    erridx = datain[datain.decision.str.contains("ERROR|error") == True].index
    _, features = make_labels_features(datain)
    naidx = datain[features.isnull().any(axis=1) == True].index
    # Index.union is the explicit set union; the "|" operator no longer means
    # that for Index objects in current pandas.
    bothidx = erridx.union(naidx)
    dataout = datain.drop(bothidx)
    # TODO: should also calculate stats on this!
    lbl = datain.loc[bothidx, "label"]
    prd = lbl.copy()
    prd[:] = 0
    print(("Removing {} rows with error results and {} rows with NaNs (typically hz different) from a "
           "    total of {} entries, resulting in {} entries.").format(
            len(erridx), len(naidx), len(datain), len(dataout)))
    return dataout, lbl, prd

def stats(lbl, prd):
    """Print and return classification metrics for labels vs. predictions.

    *lbl* and *prd* are element-wise comparable sequences of 0/1 values
    (e.g. pandas Series or NumPy arrays).

    Returns ``(prec, recall, spec, acc)`` as percentages rounded to two
    decimals: precision TP/(TP+FP), recall (TPR) TP/(TP+FN), specificity
    (TNR) TN/(TN+FP) and accuracy. A metric whose denominator is zero is
    reported as 0.
    """
    # Plain ints: NumPy integers would turn a division by zero into NaN plus a
    # warning instead of something we can handle explicitly.
    tp = int(np.sum((lbl == 1) & (prd == 1)))
    fp = int(np.sum(lbl < prd))
    tn = int(np.sum((lbl == 0) & (prd == 0)))
    fn = int(np.sum(lbl > prd))

    undefined = []

    def percentage(numerator, denominator):
        if denominator == 0:
            undefined.append(True)
            return 0
        return round(100 * numerator / denominator, 2)

    prec = percentage(tp, tp + fp)
    recall = percentage(tp, tp + fn)
    spec = percentage(tn, tn + fp)
    acc = percentage(tn + tp, tn + fn + fp + tp)
    if undefined:
        print("Catching ZeroDivisionError at stats!")
    print("Correct: {}, incorrect {}, TP {}, FP {}, TN {}, FN{}, Prec. {}, Rec. {}, Spec. {}, Acc. {}%".format(
        np.sum(lbl == prd),
        np.sum(lbl != prd),
        tp, fp, tn, fn,
        prec, recall, spec, acc))
    return prec, recall, spec, acc
def make_labels_features(dfin):
    """Split *dfin* into the label column and the ML feature matrix.

    Returns ``(labels, features)``. *features* is an independent copy holding
    the raw feature columns plus four derived ones: ``hzr2mean``, ``r2mean``,
    ``ott_rng_mean`` (IPv4/IPv6 means) and ``splinediff_scaled``
    (``perc_85_val`` divided by ``ott_rng_mean``). *dfin* is not modified.
    """
    labels = dfin["label"]
    # NOTE(review): the column list was truncated in this copy of the
    # notebook; confirm no raw feature column is missing after "optsdiff".
    # .copy() so the derived columns below are not written into a slice of dfin.
    features = dfin[["hzdiff", "hzr2diff", "timestamps_diff", "adiff",
                     "theta", "r2diff", "ott_rng_diff_rel", "optsdiff"]].copy()
    features["hzr2mean"] = (dfin["hz4r2"] + dfin["hz6r2"]) / 2.0
    features["r2mean"] = (dfin["r4_sqr"] + dfin["r6_sqr"]) / 2.0
    features["ott_rng_mean"] = (dfin["ott4_rng"] + dfin["ott6_rng"]) / 2.0
    features["splinediff_scaled"] = dfin["perc_85_val"] / features["ott_rng_mean"]
    return labels, features

def get_sample_weight(sib, nonsib):
    """Class-balancing sample weights for siblings followed by non-siblings.

    WIP. Each class is weighted with the relative size of the other class, so
    both classes contribute equally overall. If one class is empty, all
    weights are 1.0.

    Returns a float32 array of length ``len(sib) + len(nonsib)``. The order
    is siblings first, then non-siblings, which matches the row order
    ``mix_sib_nonsib`` produces.
    NOTE(review): the earlier draft read labels from an undefined ``datain``
    and returned nothing; confirm this row order suits the intended caller.
    """
    sl = len(sib)
    nsl = len(nonsib)
    tl = sl + nsl
    if sl and nsl:
        nsw = sl / tl
        sw = nsl / tl
    else:
        # A single class cannot be balanced; fall back to uniform weights.
        sw = nsw = 1.0
    print("Found {} sibs and {} nonsibs, weights: {} and {}".format(sl, nsl, sw, nsw))
    weight = np.concatenate([
        np.full(sl, sw, dtype=np.float32),
        np.full(nsl, nsw, dtype=np.float32),
    ])
    return weight
def get_sample_weight_one_input(dfin):
    """Class-balancing sample weights for the rows of *dfin*.

    Rows with label 1 are weighted with the share of label-0 rows and vice
    versa, so both classes contribute equally overall. If either class is
    absent (including empty input), all rows get weight 1.0. Rows with any
    other label keep the label value as their weight.

    Returns a float32 array aligned with the rows of *dfin*.
    """
    labels = dfin["label"].to_numpy()
    # Fix the masks up front so the second assignment cannot overwrite entries
    # already set by the first one (e.g. when a weight happens to be 0).
    is_sib = labels == 1
    is_nonsib = labels == 0
    sl = int(is_sib.sum())
    nsl = int(is_nonsib.sum())
    tl = sl + nsl
    if sl and nsl:
        nsw = sl / tl
        sw = nsl / tl
    else:
        # A single class cannot be balanced; fall back to uniform weights.
        sw = nsw = 1.0
    weight = labels.astype(np.float32)
    weight[is_sib] = sw
    weight[is_nonsib] = nsw
    print("Found {} sibs and {} nonsibs, weights: {} and {}, #weights: {}".format(
        sl, nsl, round(sw, 4), round(nsw, 4), len(weight)))
    return weight

# functions for ML with proportional group sampling
def split_stratified_groups(sib, splits, nr):
    """Return fold *nr* of a per-group K-fold split of the siblings.

    Every host group (as assigned by ``assign_groups``, which adds its
    columns to *sib* in place) is split into *splits* shuffled, seeded folds,
    and fold number *nr* of each group is collected. Train and test sets
    therefore contain every group proportionally. A group with fewer samples
    than *splits* uses as many folds as it has samples; it contributes
    nothing when *nr* is not below that number.

    Returns ``[train, test]`` as two DataFrames.
    KFold needs at least two folds, so a single-sample group makes it raise.
    """
    from sklearn.model_selection import KFold  # non-overlapping!
    groups = assign_groups(sib)
    train_parts = []
    test_parts = []
    # sorted(): set iteration order of strings varies between interpreter runs.
    for group in sorted(set(groups)):
        groupsib = sib[sib["group"] == group].copy()
        # Per-group fold count, so one small group does not shrink the folds
        # of every group handled after it.
        n_splits = splits
        if len(groupsib) < n_splits:
            # can not split into more folds than samples...
            print("ERROR: more splits ({}) than samples ({}), reducing to sample nr".format(splits, len(groupsib)))
            n_splits = len(groupsib)
        ks = KFold(n_splits=n_splits, random_state=42, shuffle=True)
        for fold, (train_index, test_index) in enumerate(ks.split(groupsib)):
            if fold == nr:
                train_parts.append(groupsib.iloc[train_index])
                test_parts.append(groupsib.iloc[test_index])
                break
    empty = pd.DataFrame(columns=sib.columns)
    gsibdf_train = pd.concat(train_parts) if train_parts else empty
    gsibdf_test = pd.concat(test_parts) if test_parts else empty.copy()
    return [gsibdf_train, gsibdf_test]

def dt_train(labels, features, weight, rs=42):
    """Fit a decision tree on *features*/*labels* using sample weights.

    *rs* seeds the classifier's random state (default 42).
    Returns the fitted estimator.
    """
    estimator = DecisionTreeClassifier(max_depth=30, min_samples_leaf=5, random_state=rs)
    est = estimator.fit(features, labels, sample_weight=weight)
    return est

def kfold_train_test(sib, nonsib):
    """Run a 10-fold train/test evaluation of the decision tree.

    In each round the siblings are split per group, matching non-siblings are
    selected for the train and test part, unusable rows are pruned, a tree is
    trained with class-balancing weights and scored on both parts.

    Returns ``(stats_train_error, stats_test_error, graphs)``: two
    ``(kfolds, 4)`` arrays holding the ``stats`` tuple of each round, and the
    list of per-round ``dt_plot`` results.
    NOTE(review): ``dt_plot`` is not defined in this section of the notebook;
    presumably it is provided elsewhere.
    """
    kfolds = 10
    stats_train_error = np.empty((kfolds, 4), dtype=float)
    stats_test_error = np.empty((kfolds, 4), dtype=float)
    graphs = []
    for i in range(kfolds):
        print("Round {}".format(i))
        # pick proportionally from each group
        train_sib, test_sib = split_stratified_groups(sib, kfolds, i)
        # create, select, and mix matching nonsibs
        train_nonsib = match_nonsibs(train_sib, nonsib)
        test_nonsib = match_nonsibs(test_sib, nonsib)
        train = mix_sib_nonsib(train_sib, train_nonsib, "all")
        test = mix_sib_nonsib(test_sib, test_nonsib, "all")
        # prune NaNs out (labels/predictions of pruned rows are not scored yet)
        train, train_prune_lbl, train_prune_prd = prune_data_for_ml(train)
        test, test_prune_lbl, test_prune_prd = prune_data_for_ml(test)
        # split out features, labels, and weights
        train_lbl, train_ftr = make_labels_features(train)
        test_lbl, test_ftr = make_labels_features(test)
        train_weight = get_sample_weight_one_input(train)
        # Not used for scoring; kept for its diagnostic print of the test mix.
        test_weight = get_sample_weight_one_input(test)
        # train estimator
        est = dt_train(train_lbl, train_ftr, train_weight)
        stats_train_error[i] = stats(train_lbl, est.predict(train_ftr))
        stats_test_error[i] = stats(test_lbl, est.predict(test_ftr))
        # Collect the plot so the returned list is actually populated.
        graphs.append(dt_plot(est, train_ftr))
    return stats_train_error, stats_test_error, graphs

Iterate through large-scale measurement files...

def get_ls_files(folder):
    """Read all large-scale sibling result CSVs below the *folder* prefix.

    *folder* is used as a plain string prefix, so it must end with a path
    separator. Only the columns needed for the evaluation are loaded; the
    first of them becomes the index.

    Returns one DataFrame with the rows of all files (in sorted file-name
    order), or an empty DataFrame when no file matches.
    """
    import glob
    pattern = folder + "ls_candidates.csv*siblings-filtered.uniqpairs.ips.*sibling-measurement-*.pcap.ts.siblingresult.csv"
    usecols = ["domain", "ip4", "ip6", "hz6r2", "hz4r2", "optsdiff", "hzdiff",
               "timestamps_diff", "dec_bev", "decision"]
    # Collect first and concatenate once: DataFrame.append no longer exists
    # and copied the whole frame on every call.
    frames = [pd.read_csv(filename, index_col=0, usecols=usecols)
              for filename in sorted(glob.glob(pattern))]
    ctr = len(frames)
    ls = pd.concat(frames) if frames else pd.DataFrame()
    print("Read {} decisions from {} files.".format(len(ls), ctr))
    return ls

def decision_ml1(row, tsd_thresh=0.2557):
    """Classify one measurement row with the simple learned decision rule.

    *row* must expose ``decision``, ``optsdiff``, ``hzdiff`` and
    ``timestamps_diff``. *tsd_thresh* is the timestamp-difference threshold
    (default learned from the ML decision tree).

    Returns a "sibling(...)"/"non-sibling(...)" verdict, the original
    ``decision`` text when that reports an error and no rule applies, or
    "unknown!" otherwise.
    """
    decision = row.decision
    if decision == "ERROR: too small clock hertz r-squares":
        return decision
    # Comparisons with NaN are False, so rows with missing measurements fall
    # through all four rules below.
    if row.optsdiff > 0:
        return "non-sibling(optsdiff)"
    elif row.hzdiff > 0:
        return "non-sibling(hzdiff)"
    elif row.timestamps_diff <= tsd_thresh:
        return "sibling(tsdiff)"
    elif row.timestamps_diff > tsd_thresh:
        return "non-sibling(tsdiff)"
    elif isinstance(decision, str) and ("ERROR" in decision or "error" in decision):
        return decision
    else:
        return "unknown!"
def fix_dec_bev(row):
    """Fill a missing ``dec_bev`` with our own error text where available.

    Returns ``row.dec_bev`` when it is a string. When it is NaN and
    ``row.decision`` reports an error ("ERROR"/"error"), that error text is
    returned instead. In every other case ``row.dec_bev`` is returned
    unchanged.
    """
    dec_bev = row.dec_bev
    if isinstance(dec_bev, str):
        return dec_bev
    decision = row.decision
    # Guard the isnan/`in` calls by type so unexpected values do not raise.
    if isinstance(dec_bev, float) and np.isnan(dec_bev):
        if isinstance(decision, str) and ("ERROR" in decision or "error" in decision):
            return decision
    return dec_bev

# Load every large-scale measurement result below this prefix into one frame.
ls = get_ls_files("/srv/scheitle-2016-siblings/ls/")

Read 1891748 decisions from 37 files.

# Item assignment creates a real column; assigning to a new attribute name
# only attaches the Series to this object and is lost on any copy.
ls["dec_ml1"] = ls.apply(decision_ml1, axis=1)

# Item assignment creates a real column; assigning to a new attribute name
# only attaches the Series to this object and is lost on any copy.
ls["dec_bev_fixed"] = ls.apply(fix_dec_bev, axis=1)

from collections import Counter

Counter({'ERROR: clock <1hz': 108,
         'ERROR: too small clock hertz r-squares': 201375,
         'non-sibling(hzdiff)': 3855,
         'non-sibling(optsdiff)': 50156,
         'non-sibling(tsdiff)': 733911,
         'sibling(tsdiff)': 902343})

Counter({'ERROR: binEqual calculation failed!': 44,
         'ERROR: clock <1hz': 117,
         'ERROR: den_arr6 empty!': 18042,
         'ERROR: spline calculation failed!': 520,
         'ERROR: too small clock hertz r-squares': 242328,
         'error_percval!': 6,
         'no skew(unknown)': 25153,
         'non-sibling (hz different)': 3888,
         'non-sibling(big rsqr deviation)': 336,
         'non-sibling(one negligible and ott diff delta too large)': 13944,
         'non-sibling(optsdiff)': 9149,
         'non-sibling(spline test)': 1385,
         'non-sibling(spline test)bigrng': 228133,
         'non-sibling(tsdiff)': 698016,
         'sibling(spline test)': 247084,
         'sibling(spline test)bigrng': 62721,
         'sibling(valid slope/small slope diff)': 150634,
         'unknown(spline guard interval)': 190248})

Counter({'ERROR: binEqual calculation failed!': 44,
         'ERROR: clock <1hz': 117,
         'ERROR: spline calculation failed!': 520,
         'ERROR: too small clock hertz r-squares': 242328,
         'error: den_arr6 empty!': 18042,
         'error_percval!': 6,
         'non-sibling (hz different)': 3888,
         'non-sibling(optsdiff)': 9149,
         'sibling(tau)': 1617654})

Convert to candidate pairs

# Combined key column, kept for reference.
ls["ip4ip6"] = ls.ip4+ls.ip6
# De-duplicate on the address pair itself: the undelimited concatenation can
# collide for different pairs (e.g. "1.2.3.4"+"21::1" vs "1.2.3.42"+"1::1").
# .copy() gives an independent frame that is safe to add columns to.
lscp = ls.drop_duplicates(subset=["ip4", "ip6"]).copy()

# assign() adds a real column and returns a new frame; assigning to a new
# attribute name would only attach the Series to this one object.
lscp = lscp.assign(dec_ml1=lscp.apply(decision_ml1, axis=1))

# assign() adds a real column and returns a new frame; assigning to a new
# attribute name would only attach the Series to this one object.
lscp = lscp.assign(dec_bev_fixed=lscp.apply(fix_dec_bev, axis=1))

Counter({'ERROR: binEqual calculation failed!': 18,
         'ERROR: clock <1hz': 81,
         'ERROR: den_arr6 empty!': 1229,
         'ERROR: spline calculation failed!': 198,
         'ERROR: too small clock hertz r-squares': 143012,
         'error_percval!': 3,
         'no skew(unknown)': 3109,
         'non-sibling (hz different)': 928,
         'non-sibling(big rsqr deviation)': 124,
         'non-sibling(one negligible and ott diff delta too large)': 632,
         'non-sibling(optsdiff)': 3153,
         'non-sibling(spline test)': 406,
         'non-sibling(spline test)bigrng': 67332,
         'non-sibling(tsdiff)': 53074,
         'sibling(spline test)': 19869,
         'sibling(spline test)bigrng': 13403,
         'sibling(valid slope/small slope diff)': 24240,
         'unknown(spline guard interval)': 21183})

# Tally our algorithm's verdicts per candidate pair; everything that is
# neither "sibling..." nor "non-sibling..." is reported as unknown.
c = len(lscp)
nonsib = np.count_nonzero(lscp.decision.str.contains("^non-sibling") == True)
sib = np.count_nonzero(lscp.decision.str.contains("^sibling") == True)
print("our algo: {} decision, {} sibling, {} non-sibling, {} unknown".format(
    c, sib, nonsib, c - sib - nonsib))

our algo: 351994 decision, 57512 sibling, 125649 non-sibling, 168833 unknown

# Same tally for the repaired Beverly decisions (dec_bev_fixed).
c = len(lscp)
nonsib = np.count_nonzero(lscp.dec_bev_fixed.str.contains("^non-sibling") == True)
sib = np.count_nonzero(lscp.dec_bev_fixed.str.contains("^sibling") == True)
print("bev algo: {} decision, {} sibling, {} non-sibling, {} unknown".format(
    c, sib, nonsib, c - sib - nonsib))

bev algo: 351994 decision, 203372 sibling, 4081 non-sibling, 144541 unknown

# Same tally for the simple learned rule (dec_ml1).
c = len(lscp)
nonsib = np.count_nonzero(lscp.dec_ml1.str.contains("^non-sibling") == True)
sib = np.count_nonzero(lscp.dec_ml1.str.contains("^sibling") == True)
print("ml1 algo: {} decision, {} sibling, {} non-sibling, {} unknown".format(
    c, sib, nonsib, c - sib - nonsib))

ml1 algo: 351994 decision, 149215 sibling, 59692 non-sibling, 143087 unknown

our algo: 351994 decision, 57512 sibling, 125649 non-sibling, 168833 unknown

Counter({'ERROR: binEqual calculation failed!': 18,
         'ERROR: clock <1hz': 81,
         'ERROR: spline calculation failed!': 198,
         'ERROR: too small clock hertz r-squares': 143012,
         'error: den_arr6 empty!': 1229,
         'error_percval!': 3,
         'non-sibling (hz different)': 928,
         'non-sibling(optsdiff)': 3153,
         'sibling(tau)': 203372})

Counter({'ERROR: clock <1hz': 75,
         'ERROR: too small clock hertz r-squares': 143012,
         'non-sibling(hzdiff)': 921,
         'non-sibling(optsdiff)': 3172,
         'non-sibling(tsdiff)': 55599,
         'sibling(tsdiff)': 149215})

Counter({'ERROR: binEqual calculation failed!': 18,
         'ERROR: clock <1hz': 81,
         'ERROR: den_arr6 empty!': 1229,
         'ERROR: spline calculation failed!': 198,
         'ERROR: too small clock hertz r-squares': 143012,
         'error_percval!': 3,
         'no skew(unknown)': 3109,
         'non-sibling (hz different)': 928,
         'non-sibling(big rsqr deviation)': 124,
         'non-sibling(one negligible and ott diff delta too large)': 632,
         'non-sibling(optsdiff)': 3153,
         'non-sibling(spline test)': 406,
         'non-sibling(spline test)bigrng': 67332,
         'non-sibling(tsdiff)': 53074,
         'sibling(spline test)': 19869,
         'sibling(spline test)bigrng': 13403,
         'sibling(valid slope/small slope diff)': 24240,
         'unknown(spline guard interval)': 21183})

# ht + ml1 intersection: True where both our decision and the ml1 rule say
# "sibling". "&" binds tighter than "==", so the conjunction is formed first
# and then compared with True.
a = (
    lscp.decision.str.contains("^sibling")
    & lscp.dec_ml1.str.contains("^sibling")
) == True

Counter({False: 295222, True: 56772})

# to_numpy() replaces as_matrix(), which no longer exists in pandas.
b = a.to_numpy()

