In [1]:
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
from scipy.signal import argrelextrema
from IPython.display import HTML
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
%matplotlib inline
# feature subset used for clustering; the numbers are the features' indices in
# the full feature set, and commented-out features are excluded
features = [
    "request_interval",           # 0
    #"ua_change_rate",            # 1
    #"html2image_ratio",          # 2
    #"variance_request_interval", # 3
    "payload_average",            # 4
    #"error_rate",                # 5
    "request_depth",              # 6
    #"request_depth_std",         # 7
    "session_length",             # 8
    "percentage_cons_requests",   # 9
]
columns = []
for f in features:
columns.append((f, 'float'))
In [10]:
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')
Out[10]:
In [2]:
# enable the codefolding notebook extension so long cells can be collapsed
import notebook
E = notebook.nbextensions.EnableNBExtensionApp()
E.enable_nbextension('usability/codefolding/main')
In [4]:
# Read from file
# the original file is here: https://github.com/ludost11/justhackdeflect/blob/master/feature_db-files.txt
# each line is expected to look like "(<ip>) {<name>: <value>, <name>: <value>, ...}"
filename = '../data/feature_db-files.txt'
values = []
with open(filename) as feature_file:
    for line in feature_file:
        # keep only the "{...}" part that holds the feature values
        useful_part = line.split(') {')[1]
        useful_part = useful_part[:-2]  # strip the trailing "}\n"
        new_split = useful_part.split(', ')
        num_list = []
        for b in new_split:
            c = b.split(': ')[1]
            num_list.append(float(c))
        values.append(num_list)
X = np.array(values)
print X.shape
In [3]:
# Read from Database
import MySQLdb
id_incident = 19
# credentials redacted; replace with your own MySQL user and password
db = MySQLdb.connect(host='127.0.0.1', user='root', passwd='...', port=3306, db='bothound')
cur = db.cursor(MySQLdb.cursors.DictCursor)
cur.execute("select * from sessions WHERE id_incident = {0}".format(id_incident))
sessions = [dict(elem) for elem in cur.fetchall()]
db.close()
values = []
for s in sessions:
    row = []
    for f in features:
        row.append(s[f])
    values.append(row)
X = np.array(values)
print X.shape
In [3]:
# perform PCA dimensionality reduction
# note: `import sklearn` alone does not expose sklearn.decomposition, so import it explicitly
from sklearn.decomposition import RandomizedPCA
pca = RandomizedPCA(n_components=3).fit(X)
X = pca.transform(X)
#totss = sum(pdist(X)**2)/X.shape[0] # the total sum of squares
#print "total ss", totss
In [3]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=20).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % num_clusters)
#plt.hist(labels, bins = num_clusters+1)
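# A minimal sketch of scoring this labelling with the (so far unused) `metrics`
# import: silhouette ranges from -1 to 1, higher meaning denser, better-separated
# clusters. Noise points (label -1) are treated as one group here, which is
# only a rough approximation.
if num_clusters > 1:
    print('Silhouette Coefficient: %0.3f' % metrics.silhouette_score(X, labels))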
In [22]:
def get_best_clustering_model(X, max_number_of_clusters):
    """Fit KMeans for k = 1..max_number_of_clusters and pick k at the elbow of the cost curve."""
    cost = []
    KK = range(1, max_number_of_clusters+1)
    kms = []
    # fit a model for every candidate number of clusters and record its cost;
    # km.inertia_ is the total within-cluster sum of squares
    for no_of_clusters in KK:
        km = KMeans(n_clusters=no_of_clusters, precompute_distances=True, max_iter=500, n_init=30)
        km.fit(X)
        kms.append(km)
        cost.append(km.inertia_)
    # first derivative: how much adding one more cluster reduces the cost
    derivative1 = [cost[i+1]-cost[i] for i in range(len(cost)-1)]
    # second derivative: where that improvement itself levels off (the elbow)
    derivative2 = [derivative1[i+1]-derivative1[i] for i in range(len(derivative1)-1)]
    # look for a local minimum in the rank ordering of the second derivative;
    # the +3 offset maps an index in derivative2 back to a cluster count
    max2 = argrelextrema(np.argsort(derivative2), np.less)
    num_clusters = 4  # fallback if no extremum is found
    if len(max2[0]) > 0:
        num_clusters = max2[0][0] + 3
    else:
        # fall back to the third derivative
        derivative3 = [derivative2[i+1]-derivative2[i] for i in range(len(derivative2)-1)]
        max3 = argrelextrema(np.argsort(derivative3), np.greater)
        if len(max3[0]) > 0:
            num_clusters = max3[0][0] + 4
    return kms[num_clusters-1], cost
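# A minimal sketch (synthetic numbers, not from the data) of what the elbow
# heuristic inspects: the cost curve drops steeply up to the true k and then
# flattens, so the large second-derivative values sit near the elbow.
demo_cost = [100.0, 55.0, 30.0, 26.0, 24.0, 23.0]            # k = 1..6
demo_d1 = [demo_cost[i+1] - demo_cost[i] for i in range(5)]  # [-45, -25, -4, -2, -1]
demo_d2 = [demo_d1[i+1] - demo_d1[i] for i in range(4)]      # [20, 21, 2, 1]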
max_number_of_clusters = 10
model, cost = get_best_clustering_model(X, max_number_of_clusters)
labels = model.predict(X)
num_clusters = model.n_clusters
print "Num clusters:", num_clusters
KK = range(1, max_number_of_clusters+1)
# elbow curve, with the chosen number of clusters circled in red
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(KK, cost, 'b*-')
ax.plot(num_clusters, cost[num_clusters-1], marker='o', markersize=14,
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Within-cluster sum of squares')
plt.title('Elbow for KMeans clustering')
color_set = [
[0, 0, 255], #Blue
[255, 0, 0], #Red
[0, 255, 0], #Green
[255, 255, 0], #Yellow
[255, 0, 255], #Magenta
[255, 128, 128], #Pink
[128, 128, 128], #Gray
[128, 0, 0], #Brown
[255, 128, 0], #Orange
]
def get_colors(N=5):
result = []
for x in range(N):
s = color_set[x % len(color_set)]
result.append([s[0]/255.0,s[1]/255.0,s[2]/255.0,1])
return result
palette = get_colors(num_clusters+1)
# cluster size histogram
sizes = [0] * model.n_clusters
for i in labels:
    sizes[i] += 1
print sizes
index_max = sizes.index(max(sizes))
largest_cluster_percentage = float(sizes[index_max]) / X.shape[0] * 100
left = range(len(sizes))
fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(left, sizes, color=palette)
X_plot = X[labels >= 0, :]
labels_plot = labels[labels >= 0]
colors = [palette[i] for i in labels_plot]
def plot3(indexes):
    # scatter the sessions in 3D along three selected columns of X_plot;
    # note: after the PCA reduction above the columns are principal components,
    # so the feature names used as axis labels only fit the raw feature matrix
    fig = plt.figure(figsize=(24, 24))
    ax = fig.gca(projection='3d')
    ax.set_xlabel(features[indexes[0]])
    ax.set_ylabel(features[indexes[1]])
    ax.set_zlabel(features[indexes[2]])
    ax.view_init(elev=15, azim=125)
    x = X_plot[:, indexes[0]]
    y = X_plot[:, indexes[1]]
    z = X_plot[:, indexes[2]]
    ax.scatter(x, y, z, s=5, edgecolors='none', c=colors, marker='o')
plot3([0, 1, 2])
In [23]:
# %load "../src/bothound_tools.py"
"""
Utility class that holds commonly used Bothound functions
"""
import numpy as np
from sklearn.cluster import DBSCAN
import hashlib, hmac
import MySQLdb
from features.src.feature_geo import FeatureGEO
from features.src.feature_deflectee import FeatureDeflectee
from util.crypto import encrypt
import pdb
class BothoundTools():
def connect_to_db(self):
"""
This connetcion to the db will live for the live time of the
learn2bantools instance and will be used to save data back to the db
"""
self.db = MySQLdb.connect(host = self.db_host, user = self.db_user,
passwd = self.db_password,port = self.db_port)
#Create cursor object to allow query execution
self.cur = self.db.cursor(MySQLdb.cursors.DictCursor)
sql = 'CREATE DATABASE IF NOT EXISTS ' + self.db_name
self.cur.execute(sql)
self.db.close()
#Connect directly to DB
self.db = MySQLdb.connect(host = self.db_host, user = self.db_user,
passwd = self.db_password, port = self.db_port, db = self.db_name)
self.cur = self.db.cursor(MySQLdb.cursors.DictCursor)
# ATTACKS table
self.cur.execute("create table IF NOT EXISTS attacks (id INT NOT NULL AUTO_INCREMENT, "
"comment LONGTEXT, "
"PRIMARY KEY(id)) ENGINE=INNODB;")
# INCIDENTS table
self.cur.execute("create table IF NOT EXISTS incidents (id INT NOT NULL AUTO_INCREMENT, "
"id_attack INT,"
"start DATETIME, "
"stop DATETIME, "
"banjax_start DATETIME, "
"banjax_stop DATETIME, "
"comment LONGTEXT, "
"processed BOOL,"
"PRIMARY KEY(id)) "
"ENGINE=INNODB;")
        try:
            self.cur.execute("ALTER TABLE incidents ADD target LONGTEXT;")
        except MySQLdb.OperationalError:
            pass  # column already exists
        try:
            self.cur.execute("ALTER TABLE incidents ADD cluster_index INT;")
        except MySQLdb.OperationalError:
            pass  # column already exists
# SESSIONS table
self.cur.execute("create table IF NOT EXISTS sessions (id INT NOT NULL AUTO_INCREMENT, "
"id_incident INT NOT NULL, "
"cluster_index INT, "
"IP VARCHAR(45), "
"IP_ENCRYPTED LONGTEXT, "
"IP_IV LONGTEXT, "
"IP_TAG LONGTEXT, "
"request_interval FLOAT, " #Feature Index 1
"ua_change_rate FLOAT, " #Feature Index 2
"html2image_ratio FLOAT, " #Feature Index 3
"variance_request_interval FLOAT, " #Feature Index 4
"payload_average FLOAT, " #Feature Index 5
"error_rate FLOAT, " #Feature Index 6
"request_depth FLOAT, " #Feature Index 7
"request_depth_std FLOAT, " #Feature Index 8
"session_length FLOAT, " #Feature Index 9
"percentage_cons_requests FLOAT," #Feature Index 10
"latitude FLOAT," #Feature Index 11
"longitude FLOAT," #Feature Index 12
"id_country INT," #Feature Index 13
"id_deflectee INT," #Feature Index 14
"PRIMARY KEY(id), INDEX index_incicent (id_incident), "
"FOREIGN KEY (id_incident) REFERENCES incidents(id) ON DELETE CASCADE ) ENGINE=INNODB;")
# CLUSTERS table
self.cur.execute("create table IF NOT EXISTS clusters (id INT NOT NULL AUTO_INCREMENT, "
"id_incident INT NOT NULL, "
"cluster_index INT NOT NULL, "
"comment LONGTEXT, "
"PRIMARY KEY(id), INDEX index_incicent (id_incident), "
"FOREIGN KEY (id_incident) REFERENCES incidents(id) ON DELETE CASCADE ) ENGINE=INNODB;")
# DEFLECTEES table
self.cur.execute("create table IF NOT EXISTS deflectees (id INT NOT NULL AUTO_INCREMENT, "
"domain LONGTEXT, "
"comment LONGTEXT, "
"PRIMARY KEY(id)) ENGINE=INNODB;")
# COUNTRIES table
self.cur.execute("create table IF NOT EXISTS countries (id INT NOT NULL AUTO_INCREMENT, "
"code LONGTEXT, "
"name LONGTEXT, "
"PRIMARY KEY(id)) ENGINE=INNODB;")
# Intersections table
self.cur.execute("create table IF NOT EXISTS intersections (id INT NOT NULL AUTO_INCREMENT, "
"id_incident INT,"
"id_incident2 INT,"
"total INT, "
"intersection FLOAT, " # (length of id_incident)*100/total
"intersection2 FLOAT, " # (length of id_incident2)*100/total
"PRIMARY KEY(id), INDEX index_incicent (id_incident),"
"FOREIGN KEY (id_incident) REFERENCES incidents(id) ON DELETE CASCADE"
") ENGINE=INNODB;")
def get_deflectees(self):
self.cur.execute("select * from deflectees")
return [dict(elem) for elem in self.cur.fetchall()]
def get_countries(self):
self.cur.execute("select * from countries")
return [dict(elem) for elem in self.cur.fetchall()]
"""
Post process features calculated by "lear2bat_feature" class instances
"""
def post_process(self, ip_feature_db):
# factorize the deflectees
ip_feature_db = self.factorize_deflectees(ip_feature_db)
        # factorize the countries
ip_feature_db = self.factorize_countries(ip_feature_db)
return ip_feature_db
"""
Replace domain string value in ip_feature_db with the appropriate
ID from deflectees table.
Create new rows in deflectees table if necessary
"""
def factorize_deflectees(self, ip_feature_db):
deflectees = self.get_deflectees()
ids = {}
for d in deflectees:
ids[d["domain"]] = d['id']
feature_index = FeatureDeflectee({},{}).get_index()
for ip in ip_feature_db:
features = ip_feature_db[ip]
domain = features[feature_index]
if(isinstance(domain, ( int, long ) ) == True):
continue
if(domain in ids):
features[feature_index] = ids[domain]
else:
self.cur.execute("insert into deflectees(domain) values ('{}')".format(domain))
ids[domain] = self.cur.lastrowid
features[feature_index] = self.cur.lastrowid
self.db.commit()
return ip_feature_db
"""
Replace country string value in ip_feature_db with the appropriate
ID from countreis table.
Create new rows in countreis table if necessary
"""
def factorize_countries(self, ip_feature_db):
countries = self.get_countries()
ids = {}
for c in countries:
ids[c["code"]] = c['id']
feature_index = FeatureGEO({},{}).get_index() + 2
for ip in ip_feature_db:
features = ip_feature_db[ip]
country_code = features[feature_index]
if(isinstance(country_code, ( int, long ) ) == True):
continue
if(country_code in ids):
features[feature_index] = ids[country_code]
else:
self.cur.execute("insert into countries(code) values ('{}')".format(country_code))
ids[country_code] = self.cur.lastrowid
features[feature_index] = self.cur.lastrowid
self.db.commit()
return ip_feature_db
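    # e.g. the first time the code "CA" is seen it is inserted into the countries
    # table and its lastrowid (say 7, a made-up value) replaces the string in the
    # feature row; every later "CA" then maps to the same id 7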
def delete_sessions(self, id_incident):
self.cur.execute("DELETE FROM sessions WHERE id_incident = {0}".format(id_incident))
self.db.commit()
    def add_sessions(self, id_incident, ip_feature_db):
        for ip in ip_feature_db:
            features = ip_feature_db[ip]
            # 7 fixed columns (id, id_incident, cluster_index, IP, IP_ENCRYPTED,
            # IP_IV, IP_TAG) plus one placeholder per feature
            insert_sql = "insert into sessions values (%s,%s,%s,%s,%s,%s,%s"
            insert_sql += ",%s" * len(features)
            insert_sql += ")"
            ip_ascii = ip[0].encode('ascii', 'ignore')
            ip_encrypted = self.encrypt(ip_ascii)
            ip_hash = self.hash(ip_ascii)
            values = [0, id_incident, 0, ip_hash, ip_encrypted[0], ip_encrypted[1], ip_encrypted[2]]
            for feature in features:
                values.append(features[feature])
            self.cur.execute(insert_sql, values)
        self.db.commit()
def get_sessions(self, id_incident):
self.cur.execute("select * from sessions WHERE id_incident = {0}".format(id_incident))
return [dict(elem) for elem in self.cur.fetchall()]
def set_incident_processed(self, id, processed):
sql = "update incidents set processed={} WHERE id = {}".format(processed,id)
self.cur.execute(sql)
self.db.commit()
def get_incidents(self, processed):
self.cur.execute("select * from incidents WHERE "
"cast(processed as unsigned) = %d" % (1 if processed else 0))
return [dict(elem) for elem in self.cur.fetchall()]
def get_incident(self, id):
self.cur.execute("select * from incidents WHERE id = %d" % id)
incident = None
for row in self.cur.fetchall():
incident = row
return incident
def get_processed_incidents(self):
return self.get_incidents(True)
def get_not_processed_incidents(self):
return self.get_incidents(False)
def update_geo(self, id_incident):
self.cur.execute("select id, ip from sessions WHERE id_incident = {0}".format(id_incident))
        rows = self.cur.fetchall()
for row in rows:
match = FeatureGEO.find_location(row['ip'])
sql = "update sessions set latitude={}, longitude={}, country='{}' WHERE id = {}".format(match['latitude'], match['longitude'], match['country'], row['id'])
self.cur.execute(sql)
self.db.commit()
return
def disconnect_from_db(self):
"""
Close connection to the database
"""
self.cur.close()
self.db.close()
def load_database_config(self, database_conf, elastic_db_conf):
self.db_user = database_conf["user"]
self.db_password = database_conf["password"]
self.db_host = database_conf["host"]
self.db_name = database_conf["name"]
if("port" in database_conf):
self.db_port = database_conf["port"]
else:
self.db_port = 3306
self.db_encryption_key = hashlib.sha256(database_conf["encryption_passphrase"]).digest()
self.db_hash_key = hashlib.sha256(database_conf["hash_passphrase"]).digest()
#read elastic search user and password
self.es_user = elastic_db_conf["user"]
self.es_password = elastic_db_conf["password"]
self.es_host = elastic_db_conf["host"]
if("port" in elastic_db_conf):
self.es_port = elastic_db_conf["port"]
else:
self.es_port = 9200
def random_slicer(self, data_size, train_portion=0.5):
"""
Return two arrays with random true and false and complement of each
other, used for slicing a set into trainig and testing
INPUT:
data_size: size of the array to return
train_portion: between 0,1 indicate the portion for the True
entry
"""
from random import random
random_selector = [random() < train_portion for i in range(0, data_size)]
complement_selector = np.logical_not(random_selector)
return random_selector, complement_selector
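    # A minimal usage sketch (variable names hypothetical): pick ~70% of the
    # rows for training and the complementary rows for testing.
    #   train_sel, test_sel = tools.random_slicer(len(X), train_portion=0.7)
    #   X_train, X_test = X[train_sel], X[test_sel]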
"""
create a test incident and all the sessions from
../data/feature_db-files.txt
"""
def get_test_incident(self):
test_comment = 'Test incident'
#check if the test incident exists
self.cur.execute("select id from incidents WHERE comment = '{0}'".format(test_comment))
for row in self.cur.fetchall():
print "test incident exists", row['id']
return row['id']
print "creating test incident..."
#new incident record
sql = "insert into incidents(comment) VALUES('%s')" % (test_comment)
self.cur.execute(sql)
id_incident = self.cur.lastrowid
print "id_incident", id_incident
        filename = '../data/feature_db-files.txt'
        feature_file = open(filename)
        line_number = 15000  # used as a fake, unique IP value per session
        for line in feature_file:
            # each line looks like "(<ip>) {<name>: <value>, ...}"; keep the values
            splitted_line = line.split(') {')
            useful_part = splitted_line[1]
            useful_part = useful_part[:-2]
            new_split = useful_part.split(', ')
            insert_sql = "insert into sessions values (NULL," + str(id_incident) + ", 0, "
            insert_sql += "\"" + str(line_number) + "\","
            line_number = line_number + 1
            insert_sql += "0,0,0,"  # IP_ENCRYPTED, IP_IV, IP_TAG placeholders
            for b in new_split:
                c = b.split(': ')[1]
                insert_sql += str(c) + ","
            insert_sql += "0,0,0,0,"  # latitude, longitude, id_country, id_deflectee placeholders
            insert_sql = insert_sql[:-1]
            insert_sql += ");"
            self.cur.execute(insert_sql)
        feature_file.close()
        self.db.commit()
print "done."
return id_incident
"""
Cluster the sessions in the incident.
Update cluster_index in session table
Return the sessions with the calculated cluster_index
"""
def cluster(self, id_incident):
sessions = self.get_sessions(id_incident)
features = [
"request_interval",
"ua_change_rate",
"html2image_ratio",
"variance_request_interval",
"payload_average",
"error_rate",
"request_depth",
"request_depth_std",
"session_length",
"percentage_cons_requests",
#"latitude",
#"longitude"
]
data_set = []
for session in sessions:
values = []
for feature in features:
values.append(session[feature])
data_set.append(values)
        if len(data_set) == 0:
return
X = np.array(data_set)
# Compute DBSCAN
db = DBSCAN(eps=0.1, min_samples=20).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
#update the cluster column in session table
for session, label in zip(sessions, labels):
session["cluster_index"] = label
self.cur.execute('update sessions set cluster_index ={0} '
'where id={1}'.format(label, session['id']))
self.db.commit()
return sessions
    def encrypt(self, data):
        return encrypt(self.db_encryption_key, data, "")
    def hash(self, data):
        # hmac.new expects (key, msg, digestmod): key with db_hash_key, hash the data
        return hmac.new(self.db_hash_key, data, hashlib.sha256).digest()
    def encrypt_and_hash(self, data):
        """
        This is mainly for storing IPs so we don't store them
        in plaintext; we use a keyed hash so each IP converts to the same
        hash, and we can find all the sessions related to an IP
        without knowing the IP
        INPUT:: a string containing the sensitive data
        OUTPUT:: (encrypted_sensitive_data,
                  keyed_hash_of_sensitive_data)
        """
        return (self.encrypt(data), self.hash(data))
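    # A standalone sketch of the keyed-hash property relied on above (key and
    # IP are made-up values): the same input always yields the same digest, so
    # sessions can be matched by IP without storing the IP itself.
    #   key = hashlib.sha256("hash passphrase").digest()
    #   hmac.new(key, "1.2.3.4", hashlib.sha256).digest() == \
    #       hmac.new(key, "1.2.3.4", hashlib.sha256).digest()  # True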
def calculate_intersection(self, id_incident, id_incident2):
# delete the previous calculations
self.cur.execute("DELETE FROM intersections WHERE id_incident = {0}".format(id_incident))
self.db.commit()
self.cur.execute("select IP from sessions WHERE id_incident = {0}".format(id_incident))
ips = [elem["IP"] for elem in self.cur.fetchall()]
self.cur.execute("select IP from sessions WHERE id_incident = {0}".format(id_incident2))
ips2 = [elem["IP"] for elem in self.cur.fetchall()]
total = len(set(ips).intersection(ips2))
#update the table
sql = """INSERT INTO intersections (`id`, `id_incident`, `id_incident2`, `total`,
`intersection`, `intersection2`) VALUES ({},{},{},{},{},{})""".format(0,
id_incident, id_incident2, total, total*100.0/len(ips), total*100.0/len(ips2))
self.cur.execute(sql)
self.db.commit()
return
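    # Worked example (made-up sizes): if incident A has 200 distinct IPs,
    # incident B has 50, and 25 IPs occur in both, then total = 25,
    # intersection = 25*100.0/200 = 12.5 (% of A's IPs) and
    # intersection2 = 25*100.0/50 = 50.0 (% of B's IPs).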
    def calculate_all_intersections(self, id_incident):
        self.cur.execute("select * from incidents where id != {}".format(id_incident))
        for incident in self.cur.fetchall():
            self.calculate_intersection(id_incident, incident["id"])
    def __init__(self, conf):
        # people should be able to use the tool object even without a db,
        # so the constructor only loads the config and does not connect
        self.load_database_config(conf["database"], conf["elastic_db"])
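# A minimal usage sketch of BothoundTools (all configuration values below are
# placeholders; the keys follow load_database_config above):
conf = {
    "database": {
        "user": "bothound", "password": "...", "host": "127.0.0.1",
        "name": "bothound", "port": 3306,
        "encryption_passphrase": "...", "hash_passphrase": "...",
    },
    "elastic_db": {"user": "es", "password": "...", "host": "127.0.0.1", "port": 9200},
}
tools = BothoundTools(conf)
tools.connect_to_db()
sessions = tools.cluster(19)  # cluster the sessions of incident 19 with DBSCAN
tools.disconnect_from_db()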
In [5]:
# note: these column indexes assume the raw 5-feature X (no PCA reduction);
# after the 3-component PCA above only columns 0, 1, 2 exist
plot3([1,3,4])