In [1]:
import logging
logger = logging.getLogger("SR")
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(ch)
logger.setLevel(logging.DEBUG)
In [2]:
import itertools as it
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from tldextract import TLDExtract
from binlog_reader import binlog_reader
import numpy as np
class GroupActivity(BaseEstimator, TransformerMixin):
    """Pairwise similarity of a domain's client-IP sets across time slices."""

    def _similarity(self, ips_1, ips_2):
        a, b = float(len(ips_1)), float(len(ips_2))
        c = float(len(ips_1 & ips_2))
        # mean overlap, Jaccard index, intersection size
        return 0.5 * (c / a + c / b), c / len(ips_1 | ips_2), int(c)

    def fit(self, x, y=None):
        return self

    def transform(self, d2ip_idx_list):
        logger.debug("Start calc group activity")
        all_domains = set(it.chain.from_iterable(d2ip.keys() for d2ip in d2ip_idx_list))
        result = {domain: dict() for domain in all_domains}
        pairs = list(it.combinations(range(len(d2ip_idx_list)), 2))
        ptn = "({}, {})_{}"
        for idx, domain in enumerate(all_domains):
            for l, r in pairs:
                left, right = d2ip_idx_list[l].get(domain), d2ip_idx_list[r].get(domain)
                sim, jacc, ln = 0., 0., 0.
                if left and right:
                    sim, jacc, ln = self._similarity(left, right)
                result[domain].update({ptn.format(l, r, "sim"): sim,
                                       ptn.format(l, r, "jcd"): jacc,
                                       ptn.format(l, r, "length"): ln})
            if (idx + 1) % 100000 == 0:
                logger.debug("Processed %d domains", idx + 1)
        return result
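# A minimal sanity check of the similarity triple (hypothetical toy IP sets, not notebook data):
# {1, 2} and {2, 3, 4} share one address, so we expect mean overlap 0.5 * (1/2 + 1/3),
# Jaccard 1/4 and intersection size 1.
_sim, _jcd, _ln = GroupActivity()._similarity({1, 2}, {2, 3, 4})
assert (_jcd, _ln) == (0.25, 1)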
class IndexCreator(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, queries_all):
        result = []
        for idx, queries in enumerate(queries_all):
            logger.debug("Creating index, part %d/%d", idx + 1, len(queries_all))
            ip2d, d2ip = dict(), dict()
            for ip, domain in queries:
                ip2d.setdefault(ip, set()).add(domain)
                d2ip.setdefault(domain, set()).add(ip)
            result.append({"ip2d": ip2d, "d2ip": d2ip})
        return result
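# Illustration (hypothetical queries, not notebook data): each slice yields both directions
# of the bipartite IP <-> domain mapping.
_idx = IndexCreator().transform([[("10.0.0.1", "a.com"), ("10.0.0.1", "b.com")]])
assert _idx[0]["ip2d"] == {"10.0.0.1": {"a.com", "b.com"}}
assert _idx[0]["d2ip"] == {"a.com": {"10.0.0.1"}, "b.com": {"10.0.0.1"}}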
class IndexMerger(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, indexes):
        ip2d_full, d2ip_full = dict(), dict()
        for curr in indexes:
            for domain in curr["d2ip"]:
                d2ip_full.setdefault(domain, set())
                d2ip_full[domain] |= curr["d2ip"][domain]
            for ip in curr["ip2d"]:
                ip2d_full.setdefault(ip, set())
                ip2d_full[ip] |= curr["ip2d"][ip]
        return ip2d_full, d2ip_full
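# Illustration (hypothetical slices): merging unions the per-slice sets, so an IP seen in
# two slices ends up with the union of the domains it queried.
_ip2d_m, _d2ip_m = IndexMerger().transform([
    {"ip2d": {"1.1.1.1": {"a.com"}}, "d2ip": {"a.com": {"1.1.1.1"}}},
    {"ip2d": {"1.1.1.1": {"b.com"}}, "d2ip": {"b.com": {"1.1.1.1"}}},
])
assert _ip2d_m == {"1.1.1.1": {"a.com", "b.com"}}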
class PairExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._extract = TLDExtract(include_psl_private_domains=True)

    def sanitize(self, domain):
        return self._extract(domain).registered_domain

    def fit(self, x, y=None):
        return self

    def transform(self, queries_all):
        result = []
        for idx, queries in enumerate(queries_all):
            logger.debug("Transforming domains, part %d/%d", idx + 1, len(queries_all))
            current = [(ip, self.sanitize(domain)) for ip, domain in queries]
            # drop queries whose name has no registered domain (bare IPs, local names, garbage)
            current = [(ip, domain) for ip, domain in current if domain]
            result.append(set(current))
        return result
class BLReader(BaseEstimator, TransformerMixin):
    def __init__(self, fields):
        self.fields = fields

    def fit(self, x, y=None):
        return self

    def transform(self, fnames):
        result = []
        for idx, fn in enumerate(fnames):
            with open(fn, 'rb') as infile:
                logger.debug("Read file %s, part %d/%d", fn, idx + 1, len(fnames))
                reader = binlog_reader(infile, self.fields)
                current = [tuple(query[fld] for fld in self.fields) for query in reader]
                result.append(set(current))
        return result
class PosNegDomainReader(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._extract = TLDExtract(include_psl_private_domains=True)

    def sanitize(self, domain):
        return self._extract(domain).registered_domain

    def fit(self, x, y=None):
        return self

    def transform(self, fnames):
        if len(fnames) != 2:
            raise RuntimeError("Must be 2 files: positive and negative")
        result = []
        for fn in fnames:
            with open(fn, 'rb') as infile:
                logger.debug("Read file %s", fn)
                current = [self.sanitize(line.strip()) for line in infile]
                result.append(set(domain for domain in current if domain))
        return result
class FormXy(BaseEstimator, TransformerMixin):
    def __init__(self, all_domains):
        self.all_domains = set(all_domains)

    def fit(self, x, y=None):
        return self

    def transform(self, lst):
        if len(lst) != 2:
            raise RuntimeError("Must be list with 2 sets: positive and negative")
        pos, neg = lst
        pos, neg = set(pos), set(neg)
        inter = pos & neg
        logger.debug("Positive size %d, Negative size %d, Intersection %d", len(pos), len(neg), len(inter))
        # drop ambiguous domains and keep only those actually seen in the traffic
        pos, neg = (pos - inter) & self.all_domains, (neg - inter) & self.all_domains
        logger.debug("Positive after %d, Negative after %d", len(pos), len(neg))
        X = list(pos) + list(neg)
        y = [1] * len(pos) + [-1] * len(neg)
        return np.array(X), np.array(y)
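# Illustration (hypothetical sets): domains present in both lists are discarded, the remainder
# is restricted to domains seen in the traffic, and labels are +1 for the first (blacklist)
# file and -1 for the second (whitelist) file.
_Xd, _yd = FormXy(["a.com", "b.com", "c.com"]).transform([{"a.com", "b.com"}, {"b.com", "c.com"}])
assert list(_Xd) == ["a.com", "c.com"] and list(_yd) == [1, -1]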
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, indexes_all):
        return [idx[self.key] for idx in indexes_all]
class SecureRank(ClassifierMixin):
    def __init__(self, ip2d, d2ip, max_iter=20, init_abs_score=10.):
        self.ip2d = ip2d
        self.d2ip = d2ip
        self.max_iter = max_iter
        self.default_score = init_abs_score
        self.f_ord = ['sc_score', 'black_score', 'white_score']

    def fit(self, domain, d_class):
        logger.debug("Initialize scores")
        self.rank_ip = {ip: {f: 0. for f in self.f_ord} for ip in self.ip2d}
        self.rank_d = {d: {f: 0. for f in self.f_ord} for d in self.d2ip}
        # seed scores: class 1 (blacklist) gets negative scores, class -1 (whitelist) positive
        for dom, cls in zip(domain, d_class):
            if cls == 1:
                self.rank_d[dom]['sc_score'] = -float(self.default_score)
                self.rank_d[dom]['black_score'] = -float(self.default_score)
            elif cls == -1:
                self.rank_d[dom]['sc_score'] = float(self.default_score)
                self.rank_d[dom]['white_score'] = float(self.default_score)
        # propagate scores back and forth over the bipartite ip <-> domain graph
        for iteration in range(self.max_iter):
            logger.debug("Iteration %d", iteration + 1)
            for ip in self.rank_ip:
                for f in self.f_ord:
                    self.rank_ip[ip][f] = sum(self.rank_d[d][f] / len(self.d2ip[d])
                                              for d in self.ip2d[ip])
            for dom in self.rank_d:
                for f in self.f_ord:
                    self.rank_d[dom][f] = sum(self.rank_ip[ip][f] / len(self.ip2d[ip])
                                              for ip in self.d2ip[dom])
        return self

    def transform(self, domains):
        return [[self.rank_d[dom][f] for f in self.f_ord] for dom in domains]
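# Tiny propagation sketch (hypothetical two-domain graph, not the notebook's data): the single
# blacklisted seed pushes a negative score onto the shared IP, and the unlabelled domain
# behind that IP inherits part of it after one iteration.
_sr = SecureRank({"1.1.1.1": {"bad.com", "unknown.com"}},
                 {"bad.com": {"1.1.1.1"}, "unknown.com": {"1.1.1.1"}},
                 max_iter=1).fit(["bad.com"], [1])
assert _sr.rank_d["unknown.com"]["sc_score"] < 0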
class IdentityOp(BaseEstimator, TransformerMixin, ClassifierMixin):
    def __init__(self, info):
        self.info = info
        # fix a feature ordering once, taken from an arbitrary entry
        self.f_ord = list(self.info[next(iter(self.info))].keys())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return [[self.info[x][f] for f in self.f_ord] for x in X]
def perf_measure(y_pred, y_test):
    tp, tn, fp, fn = 0, 0, 0, 0
    for real, pred in zip(y_test, y_pred):
        if real == pred:
            if real == 1:
                tp += 1
            else:
                tn += 1
        else:
            if real == 1:
                # real positive predicted negative
                fn += 1
            else:
                # real negative predicted positive
                fp += 1
    return tp, tn, fp, fn
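# Quick check of the confusion counts (hypothetical labels): one of each outcome.
# y_pred = [1, -1, 1, -1] vs y_test = [1, -1, -1, 1]  ->  TP=1, TN=1, FP=1, FN=1
assert perf_measure([1, -1, 1, -1], [1, -1, -1, 1]) == (1, 1, 1, 1)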
In [3]:
import itertools as it
from sklearn.pipeline import Pipeline, FeatureUnion
"""
fnames = [
"/home/ivan/dns-03-aprl/querylog.bin.33-1459677601",
"/home/ivan/dns-03-aprl/querylog.bin.33-1459681201",
"/home/ivan/dns-03-aprl/querylog.bin.33-1459684801",
]
apriori_fnames = [
"/home/ivan/3_aprl_blacklist",
"/home/ivan/dns-whitelist/top-1m",
]
"""
fnames = [
    "/home/ivan/dns-13-aprl/querylog.bin.33-1460530801",
    "/home/ivan/dns-13-aprl/querylog.bin.33-1460534401",
    "/home/ivan/dns-13-aprl/querylog.bin.33-1460538001",
    "/home/ivan/dns-13-aprl/querylog.bin.33-1460541601",
    "/home/ivan/dns-13-aprl/querylog.bin.33-1460545201",
    "/home/ivan/dns-13-aprl/querylog.bin.33-1460548801",
]
apriori_fnames = [
    "/home/ivan/13_aprl_blacklist",
    "/home/ivan/dns-whitelist/top-1m",
]
preprocessing = Pipeline([
    ('raw_pairs', BLReader(fields=["client_ip", "dname"])),
    ('clean_pairs', PairExtractor()),
    ('indexes', IndexCreator())
])
group_activity = Pipeline([
    ("selector", ItemSelector(key="d2ip")),
    ("grp_act", GroupActivity())
])
indexes = preprocessing.transform(fnames)
ip2d_full, d2ip_full = IndexMerger().transform(indexes)
group_act = group_activity.transform(indexes)
In [4]:
read_apriori_info = Pipeline([
    ("reader", PosNegDomainReader()),
    ("form_sets", FormXy(group_act.keys()))
])
X, y = read_apriori_info.transform(apriori_fnames)
In [5]:
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score, precision_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.grid_search import GridSearchCV
union = FeatureUnion(
    transformer_list=[
        ('ranking', SecureRank(ip2d_full, d2ip_full, max_iter=20)),
        ('group_activities', IdentityOp(group_act))
    ])
clf = Pipeline([
    ('preparation', union),
    ('final_score', AdaBoostClassifier(random_state=42))
])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf.fit(X_train, y_train)
Out[5]:
In [6]:
y_pred_roc = clf.predict_proba(X_test)[:,1]
y_pred = clf.predict(X_test)
In [14]:
clf.fit(X, y)
Out[14]:
In [15]:
X_graph = d2ip_full.keys()
y_graph_prob = clf.predict_proba(X_graph)[:, np.where(clf.classes_ == 1)[0][0]]
y_graph_labels = clf.predict(X_graph)
In [17]:
result = sorted(zip(X_graph, y_graph_prob, y_graph_labels), key=lambda x: x[1], reverse=True)
with open('/home/ivan/13-aprl-lastresult.txt', 'w') as outfile:
    for domain, prob, lab in result:
        outfile.write("{}\t{}\t{}\n".format(domain, prob, lab))
In [ ]:
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 15, 15
In [ ]:
y_new = clf.predict_proba(X_test)
plt.hist(y_new[:,1][np.where(y_test > 0)], bins=30, color='r', alpha=0.5, normed=True)
plt.hist(y_new[:,1][np.where(y_test < 0)], bins=30, color='b', alpha=0.5, normed=True)
plt.show()
In [7]:
from sklearn.metrics import roc_auc_score, precision_score, classification_report
print("ROC-AUC {}".format(roc_auc_score(y_test, y_pred)))
print("PRECISION {}".format(precision_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
TP, TN, FP, FN = perf_measure(y_pred, y_test)
print("TP:{} | FP: {}| TN: {}| FN: {}".format(TP, FP, TN, FN))