In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from IPython.display import Image
import pandas
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
def get_pd_files(folder):
    sibf = folder + "hosts.csvcapture.pcap.ts.siblingresult.csv"
    nonsibf = folder + "hosts.csv__nonsiblings_seed1_n*capture.pcap.ts.siblingresult.csv"
    import glob
    for filename in glob.glob(nonsibf):
        nonsibf = filename
    import os.path
    if os.path.isfile(sibf) and os.path.isfile(nonsibf):
        print("Loading from filenames {} and {}".format(sibf, nonsibf))
    else:
        print("Files not found {} and {}".format(sibf, nonsibf))
    sib = pd.read_csv(sibf, index_col=0)
    sib['label'] = 1
    nonsib = pd.read_csv(nonsibf, index_col=0)
    nonsib['label'] = 0
    print("Read {} siblings and {} non-siblings from files.".format(len(sib), len(nonsib)))
    return sib, nonsib
def dec2prd_ours(df):
    df.loc[df["decision"].str.contains("^sibling"), "dec_prd"] = 1
    df.loc[df["decision"].str.contains("^non-sibling"), "dec_prd"] = 0
    return  # df is changed in place so no returning necessary

def dec2prd_bev(df):
    df.loc[df["dec_bev"].str.contains("^sibling"), "dec_bev_prd"] = 1
    df.loc[df["dec_bev"].str.contains("^non-sibling"), "dec_bev_prd"] = 0
    return  # df is changed in place so no returning necessary
def mix_sib_nonsib(sib, nonsib, mode, rs=42):
    if mode == "equal":
        nonsibint = nonsib.sample(n=len(sib), replace=True, weights=None, random_state=rs)
    else:
        nonsibint = nonsib
    datain = pd.concat([sib, nonsibint])
    #print("merged shape: {}".format(datain.shape))
    #print("columns: {}".format(list(datain.columns.values)))
    return datain
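For reference, a minimal sketch of the two modes on made-up toy frames (names and values are hypothetical): "equal" resamples the non-siblings to match the sibling count, any other mode concatenates the full non-sibling set.

# toy illustration, hypothetical data
toy_sib = pd.DataFrame({"label": [1, 1, 1]}, index=["a", "b", "c"])
toy_nonsib = pd.DataFrame({"label": [0] * 10}, index=list("defghijklm"))
balanced = mix_sib_nonsib(toy_sib, toy_nonsib, "equal")  # 3 siblings + 3 sampled non-siblings
full = mix_sib_nonsib(toy_sib, toy_nonsib, "full")       # 3 siblings + all 10 non-siblings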
def get_ouralgo_stats(sib, nonsib):
    #print("Our algo stats:")
    df = mix_sib_nonsib(sib, nonsib, "full", 42)
    df_ours = df[["label", "decision"]].copy()
    #print(df_ours[df_ours.isnull().any(axis=1)])
    dec2prd_ours(df_ours)
    undec = len(df_ours[df_ours.isnull().any(axis=1)])
    print("Our algo: Not deciding on {} pairs for unknown/error reasons.".format(undec))
    df_ours = df_ours.dropna()
    print("Our algo stats: ({}) undecided".format(undec))
    return stats(df_ours["label"], df_ours["dec_prd"])
def get_bev_stats(sib, nonsib):
    #print("Beverly algo stats:")
    df = mix_sib_nonsib(sib, nonsib, "full", 42)
    #print(df[df.isnull().any(axis=1)])
    df_tmp = df[["label", "dec_bev"]].copy()
    dec_nan = len(df_tmp[df_tmp["dec_bev"].isnull() == True])
    df_tmp = df_tmp[df_tmp["dec_bev"].isnull() == False]
    dec2prd_bev(df_tmp)
    undec = len(df_tmp[df_tmp.isnull().any(axis=1)])
    df_tmp = df_tmp.dropna()
    print("Beverly algo: Not deciding on {} pairs for NaN and {} pairs for unknown/error reasons.".format(dec_nan, undec))
    print("Beverly algo stats: ({}) undecided".format(undec))
    return stats(df_tmp["label"], df_tmp["dec_bev_prd"])
def match_nonsibs_slow(sib, nonsib, rs=42):
    ctr = 0
    for i, ii in sib.iterrows():
        for j, jj in sib.iterrows():
            if ii[1] != jj[1]:
                nscand = ii[1] + "_+_" + jj[1]
                #print(nscand)
                ctr += 1
                #if not (nonsib["domain"] == nscand).any():
                #    print("fail for {}".format(nscand))
                #    return
    #print(ctr)
    return
def match_nonsibs(sib, nonsib, rs=42):
    ctr = 0
    a = []
    sd = dict()   # siblings dict
    nsd = dict()  # non-siblings dict
    for i in sib.itertuples():
        sd[i[0]] = 0
    for i in nonsib.itertuples():
        nsd[i[0]] = 0
    nscand = dict()
    #nstmp = pandas.DataFrame()
    for i in sd.keys():
        for j in sd.keys():
            if i != j:
                nscandstr = i + "_+_" + j
                nscand[nscandstr] = 1
    print("Generated {} non-sibling candidates from {} siblings.".format(len(nscand), len(sib)))
    fails = []
    for k1 in nsd.keys():
        if k1 not in nscand.keys():
            fails.append(k1)
            #print("fail! {} ".format(i))
    nsfiltered = nonsib.copy()
    nsfiltered.drop(fails, inplace=True)
    return nsfiltered
def assign_groups_old(datain):
    datain["group"] = "servers"
    datain.loc[datain["domain"].str.contains("nlnog.net"), "group"] = "nlnog"
    datain.loc[datain["domain"].str.contains("RA_"), "group"] = "RA"
    datain.loc[datain["domain"].str.extract("RA_([0-9]{4})") < 6019, "group"] = "RAv1"
    datain.loc[datain["domain"].str.extract("RA_([0-9]{4})") > 6018, "group"] = "RAv2"
    return
    #groups = datain["group"].as_matrix()
    #del datain["group"]
    #return groups
def assign_groups(datain):
    datain["group"] = "servers"
    #sib.loc[sib.index.str.contains("nlnog.net"), "group"] = "nlnog"
    datain.loc[datain.index.str.contains("nlnog.net"), "group"] = "nlnog"
    datain.loc[datain.index.str.contains("RA_"), "group"] = "RA"
    datain["ra_id"] = datain.index.str.extract("RA_([0-9]{4})", expand=False).astype(float).fillna(0).astype(int)
    #datain.index.str.extract("RA_([0-9]{4})", expand=False).astype(float).fillna(0).astype(int) > 6018
    datain.loc[(datain.ra_id > 5999) & (datain.ra_id < 6019), "group"] = "RAv1"
    datain.loc[datain.ra_id > 6018, "group"] = "RAv2"
    #datain.loc[datain.index.str.extract("RA_([0-9]{4})", expand=False) > 6018, "group"] = "RAv2"
    groups = datain["group"].as_matrix()
    return groups
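A quick sanity check of the index-based grouping on a made-up index (host names are hypothetical; RA IDs 6000-6018 map to RAv1, larger ones to RAv2; relies on the same older pandas version that provides as_matrix()):

toy = pd.DataFrame(index=["web01.example.com", "x.nlnog.net_+_y.nlnog.net", "RA_6005_pair", "RA_6020_pair"])
print(assign_groups(toy))  # expected: ['servers' 'nlnog' 'RAv1' 'RAv2']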
def prune_datain(datain):
    errorc = len(datain[datain["decision"].str.contains("ERROR|error") == True])
    print("Removing {} error values from datain.".format(errorc))
    datain = datain[datain["decision"].str.contains("ERROR|error") == False]
    hzdiffc = len(datain[datain["hzdiff"] != 0])
    print("Deciding {} hzdiff hosts as non-sib, stats:".format(hzdiffc))
    lbl = datain[datain["hzdiff"] != 0]["label"]
    prd = lbl.copy()
    prd[:] = 0
    dataout = datain[datain["hzdiff"] == 0]
    # datain = datain[datain["domain"].str.contains("nlnog.net") == True]
    return dataout, lbl, prd
def prune_data_for_ml(datain):
    # just kick hzdiff out
    # problem: NaNs might be in non-feature columns such as RA_ID
    erridx = datain[datain.decision.str.contains("ERROR|error") == True].index
    labels, features = make_labels_features(datain)
    naidx = datain[features.isnull().any(axis=1) == True].index
    bothidx = erridx | naidx
    dataout = datain.drop(bothidx)
    #dataout = dataout[dataout.decision.str.contains("ERROR|error") == False]
    # TODO: should also calculate stats on this!
    lbl = datain.loc[bothidx, "label"]
    prd = lbl.copy()
    prd[:] = 0
    stats(lbl, prd)
    print("Removing {} rows with error results and {} rows with NaNs (typically hz different) from a \
total of {} entries, resulting in {} entries.".format(
        len(erridx), len(naidx), len(datain), len(dataout)))
    return dataout, lbl, prd
def stats(lbl, prd):
    tp = np.sum((lbl == 1) & (prd == 1))
    fp = np.sum(lbl < prd)
    tn = np.sum((lbl == 0) & (prd == 0))
    fn = np.sum(lbl > prd)
    try:
        prec = round(100 * tp / (tp + fp), 2)                   # precision (PPV)
        recall = round(100 * tp / (tp + fn), 2)                 # recall (TPR)
        spec = round(100 * tn / (tn + fp), 2)                   # specificity (TNR)
        acc = round(100 * (tn + tp) / (tn + fn + fp + tp), 2)   # accuracy
    except ZeroDivisionError:
        print("Catching ZeroDivisionError at stats!")
        prec = 0
        recall = 0
        spec = 0
        acc = 0
    print("Correct: {}, incorrect {}, TP {}, FP {}, TN {}, FN {}, Prec. {}, Rec. {}, Spec. {}, Acc. {}%".format(
        np.sum(lbl == prd),
        np.sum(lbl != prd),
        tp, fp, tn, fn,
        prec, recall, spec, acc
    ))
    return prec, recall, spec, acc
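A quick toy check of the counting logic (made-up labels and predictions):

toy_lbl = pd.Series([1, 1, 0, 0, 1])
toy_prd = pd.Series([1, 0, 0, 1, 1])
# TP=2, FP=1, TN=1, FN=1 -> Prec. 66.67, Rec. 66.67, Spec. 50.0, Acc. 60.0%
stats(toy_lbl, toy_prd)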
def make_labels_features(dfin):
    labels = dfin["label"]
    features = dfin[["hzdiff", "hzr2diff", "timestamps_diff", "adiff",
                     "theta", "r2diff", "ott_rng_diff_rel", "optsdiff",
                     "perc_85_val"]].copy()
    features["hzr2mean"] = (dfin["hz4r2"] + dfin["hz6r2"]) / 2.0
    features["r2mean"] = (dfin["r4_sqr"] + dfin["r6_sqr"]) / 2.0
    features["ott_rng_mean"] = (dfin["ott4_rng"] + dfin["ott6_rng"]) / 2.0
    features["splinediff_scaled"] = dfin["perc_85_val"] / features["ott_rng_mean"]
    return labels, features
def get_sample_weight(sib, nonsib):
    # WIP TODO
    #siblings = len(dfin[dfin["label"] == 1])
    #nonsiblings = len(datain[datain["label"] == 0])
    sl = len(sib)
    nsl = len(nonsib)
    tl = sl + nsl
    nsw = sl / tl
    sw = nsl / tl
    print("Found {} sibs and {} nonsibs, weights: {} and {}".format(sl, nsl, sw, nsw))
    datain = pd.concat([sib, nonsib])  # concatenate so the label column is available (was undefined here)
    weight = np.float32(datain["label"].as_matrix())
    weight[weight == 1] = sw
    weight[weight == 0] = nsw
    return weight
def get_sample_weight_one_input(dfin):
    sl = len(dfin[dfin["label"] == 1])
    nsl = len(dfin[dfin["label"] == 0])
    tl = sl + nsl
    nsw = sl / tl
    sw = nsl / tl
    weight = np.float32(dfin["label"].as_matrix())
    weight[weight == 1] = sw
    weight[weight == 0] = nsw
    print("Found {} sibs and {} nonsibs, weights: {} and {}, #weights: {}".format(
        sl, nsl, round(sw, 4), round(nsw, 4), len(weight)))
    return weight
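For intuition, a toy run of the inverse-frequency weighting (made-up labels): with 2 siblings and 8 non-siblings, each sibling gets weight 0.8 and each non-sibling 0.2, so both classes contribute the same total weight. This assumes the same older pandas version (as_matrix()) as the rest of the notebook.

toy = pd.DataFrame({"label": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
w = get_sample_weight_one_input(toy)
# w -> [0.8, 0.8, 0.2, 0.2, ...]; sum over siblings (1.6) == sum over non-siblings (1.6)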
# functions for ML with proportional group sampling
def split_stratified_groups(sib, splits, nr):
    from sklearn.model_selection import KFold  # non-overlapping!
    groups = assign_groups(sib)
    groupset = set(groups)
    gsibdf_train = pd.DataFrame(columns=sib.columns)
    gsibdf_test = pd.DataFrame(columns=sib.columns)
    for i in groupset:
        groupsib = sib[sib["group"] == i].copy()
        if len(groupsib) < splits:
            # can not split into more folds than files...
            print("ERROR: more splits ({}) than samples ({}), reducing to sample nr".format(splits, len(groupsib)))
            splits = len(groupsib)
        #print("## GROUP: {} with {} elements.".format(i, len(groupsib)))
        ks = KFold(n_splits=splits, random_state=42, shuffle=True)
        labels, features = make_labels_features(groupsib)
        ctr = -1
        for train_index, test_index in ks.split(groupsib):
            ctr += 1
            if ctr == nr:
                #print("TRAIN:", train_index, "TEST:", test_index)
                gsibdf_train = gsibdf_train.append(groupsib.iloc[train_index])
                gsibdf_test = gsibdf_test.append(groupsib.iloc[test_index])
                break
    return [gsibdf_train, gsibdf_test]
def dt_train(labels, features, weight, rs=42):
    estimator = DecisionTreeClassifier(max_depth=30, min_samples_leaf=5, random_state=rs)
    est = estimator.fit(features, labels, sample_weight=weight)
    return est
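The cross_val_score import at the top is otherwise unused; as a point of comparison with the hand-rolled k-fold loop below, a minimal sketch of the library-based cross-validation. Here `prepared` is a placeholder name for a frame assumed to come from prune_data_for_ml; note this variant does not apply the per-pair sample weights that dt_train uses.

# sketch only: library-based 10-fold CV on a prepared frame
lbl_cv, ftr_cv = make_labels_features(prepared)
scores = cross_val_score(DecisionTreeClassifier(max_depth=30, min_samples_leaf=5, random_state=42),
                         ftr_cv, lbl_cv, cv=10)
print("10-fold accuracy: {:.2f} +/- {:.2f}".format(scores.mean(), scores.std()))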
def kfold_train_test(sib, nonsib):
    kfolds = 10
    stats_train_error = np.empty((kfolds, 4), dtype=float)
    stats_test_error = np.empty((kfolds, 4), dtype=float)
    graphs = []
    for i in range(kfolds):
        print("Round {}".format(i))
        # pick proportionally from each group
        train_sib, test_sib = split_stratified_groups(sib, kfolds, i)
        # create, select, and mix matching nonsibs
        train_nonsib = match_nonsibs(train_sib, nonsib)
        test_nonsib = match_nonsibs(test_sib, nonsib)
        train = mix_sib_nonsib(train_sib, train_nonsib, "all")
        # prune NaNs out
        train, train_prune_lbl, train_prune_prd = prune_data_for_ml(train)
        test = mix_sib_nonsib(test_sib, test_nonsib, "all")
        test, test_prune_lbl, test_prune_prd = prune_data_for_ml(test)
        # split out features, labels, and weights
        train_lbl, train_ftr = make_labels_features(train)
        test_lbl, test_ftr = make_labels_features(test)
        train_weight = get_sample_weight_one_input(train)
        test_weight = get_sample_weight_one_input(test)
        # train estimator
        est = dt_train(train_lbl, train_ftr, train_weight)
        stats_train_error[i] = stats(train_lbl, est.predict(train_ftr))
        stats_test_error[i] = stats(test_lbl, est.predict(test_ftr))
        graph = dt_plot(est, train_ftr)  # dt_plot: tree visualization helper, not defined in this section
        graphs.append(graph)
        #Image(graph.create_png())
    return stats_train_error, stats_test_error, graphs
In [138]:
def get_ls_files(folder):
    ls = pandas.DataFrame()
    import glob
    ctr = 0
    for filename in glob.glob(folder + "ls_candidates.csv*siblings-filtered.uniqpairs.ips.*sibling-measurement-*.pcap.ts.siblingresult.csv"):
        ctr += 1
        #print(filename)
        ls = ls.append(pd.read_csv(filename, index_col=0, usecols=["domain", "ip4", "ip6", "hz6r2", "hz4r2", "optsdiff", "hzdiff", "timestamps_diff", "dec_bev", "decision"]))
    print("Read {} decisions from {} files.".format(len(ls), ctr))
    return ls
def decision_ml1(row):
    tsd_thresh = 0.2557  # learned from ML DT
    if row.decision == "ERROR: too small clock hertz r-squares":
        return row.decision
    if row.optsdiff > 0:
        return "non-sibling(optsdiff)"
    elif row.hzdiff > 0:
        return "non-sibling(hzdiff)"
    elif row.timestamps_diff <= tsd_thresh:
        return "sibling(tsdiff)"
    elif row.timestamps_diff > tsd_thresh:
        return "non-sibling(tsdiff)"
    elif "ERROR" in row.decision or "error" in row.decision:
        return row.decision
    else:
        return "unknown!"
def fix_dec_bev(row):
    if isinstance(row.dec_bev, str):
        return row.dec_bev
    if np.isnan(row.dec_bev):
        if "ERROR" in row.decision or "error" in row.decision:
            return row.decision
        else:
            return row.dec_bev
In [4]:
ls = get_ls_files("/srv/scheitle-2016-siblings/ls/")
In [85]:
ls["dec_ml1"] = ls.apply(decision_ml1, axis=1)
In [77]:
ls["dec_bev_fixed"] = ls.apply(fix_dec_bev, axis=1)
In [78]:
from collections import Counter
Counter(ls.dec_ml1.as_matrix())
Out[78]:
In [79]:
Counter(ls.decision.as_matrix())
Out[79]:
In [80]:
Counter(ls.dec_bev_fixed.as_matrix())
Out[80]:
In [93]:
ls["ip4ip6"] = ls.ip4+ls.ip6
lscp = ls.drop_duplicates(subset=["ip4ip6"])
lscp
In [139]:
lscp["dec_ml1"] = lscp.apply(decision_ml1, axis=1)
In [103]:
lscp["dec_bev_fixed"] = lscp.apply(fix_dec_bev, axis=1)
In [118]:
Counter(lscp.decision.as_matrix())
Out[118]:
In [141]:
sib = np.count_nonzero(lscp.decision.str.contains("^sibling") == True)
nonsib = np.count_nonzero(lscp.decision.str.contains("^non-sibling") == True)
c = len(lscp)
print("our algo: {} decision, {} sibling, {} non-sibling, {} unknown".format(c, sib, nonsib, c-sib-nonsib))
In [142]:
sib = np.count_nonzero(lscp.dec_bev_fixed.str.contains("^sibling") == True)
nonsib = np.count_nonzero(lscp.dec_bev_fixed.str.contains("^non-sibling") == True)
c = len(lscp)
print("bev algo: {} decision, {} sibling, {} non-sibling, {} unknown".format(c, sib, nonsib, c-sib-nonsib))
In [143]:
sib = np.count_nonzero(lscp.dec_ml1.str.contains("^sibling") == True)
nonsib = np.count_nonzero(lscp.dec_ml1.str.contains("^non-sibling") == True)
c = len(lscp)
print("ml1 algo: {} decision, {} sibling, {} non-sibling, {} unknown".format(c, sib, nonsib, c-sib-nonsib))
In [135]:
Counter(lscp.dec_bev_fixed.as_matrix())
Out[135]:
In [140]:
Counter(lscp.dec_ml1.as_matrix())
Out[140]:
In [137]:
Counter(lscp.decision.as_matrix())
Out[137]:
In [184]:
# ht + ml1 intersection
a = (lscp.decision.str.contains("^sibling") & lscp.dec_ml1.str.contains("^sibling") == True)
#sib = np.count_nonzero(lscp.decision.str.contains("^sibling") == True)
#np.count_nonzero(a.as_matrix())
Counter(a.as_matrix())
Out[184]:
In [168]:
b = a.as_matrix()
In [169]:
b[1]
Out[169]:
In [175]:
del np