In [83]:
# initialization
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn import preprocessing
from scipy.spatial.distance import cdist,pdist
from scipy.signal import argrelextrema
%matplotlib inline
from pylab import *
from numpy import *
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from scipy.stats import itemfreq
# boeh
from collections import OrderedDict
import pandas as pd
import bokeh.plotting as bk
bk.output_notebook()
from bokeh.charts import Bar
from bokeh.charts import Histogram
# enabling folding extension. Run it once.
import notebook
E = notebook.nbextensions.EnableNBExtensionApp()
E.enable_nbextension('usability/codefolding/main')
import yaml
from bothound_tools import BothoundTools
# Base RGB colours (0-255 per channel) for cluster plots.
# get_palette() cycles through this list when more colours are requested
# than there are entries here.
color_set = [
[0, 0, 255], #Blue
[255, 0, 0], #Red
[0, 255, 0], #Green
[255, 255, 0], #Yellow
[255, 0, 255], #Magenta
[255, 128, 128], #Pink
[128, 128, 128], #Gray
[128, 0, 0], #Brown
[255, 128, 0], #Orange
]
# Load the Bothound configuration and open the database connection that the
# cells below use through `tools`.
# The `with` block closes the config file as soon as it has been parsed
# (previously the handle was left open for the lifetime of the kernel).
# NOTE(review): yaml.load without an explicit Loader can construct arbitrary
# Python objects; that is only acceptable while the config file is trusted.
# Consider yaml.safe_load.
with open("../conf/bothound.yaml", "r") as stram:
    conf = yaml.load(stram)
tools = BothoundTools(conf)
tools.connect_to_db()
def get_palette(N=5):
    """Return N RGBA colours built by cycling through ``color_set``.

    Each entry is ``[r, g, b, 1]`` where the three colour channels are the
    0-255 values from ``color_set`` divided by 255.0.
    """
    base_count = len(color_set)
    picked = (color_set[index % base_count] for index in range(N))
    return [[rgb[0]/255.0, rgb[1]/255.0, rgb[2]/255.0, 1] for rgb in picked]


# Shared colour table used by the plotting helpers in this notebook.
palette = get_palette(80)
def plot_costs(costs, num_clusters, title):
    """Plot the clustering cost curve and circle the selected cluster count.

    costs        -- sequence of cost values; costs[k-1] is plotted at x = k
    num_clusters -- selected cluster count (1-based); costs[num_clusters-1]
                    is highlighted with a red circle
    title        -- figure title
    """
    # Unused locals were dropped, including a colormap lookup (cm.spectral)
    # whose result was never used.
    KK = range(1, len(costs) + 1)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # elbow curve
    ax.plot(KK, costs, 'b*-')
    ax.plot(num_clusters, costs[num_clusters-1], marker='o', markersize=14,
            markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    # NOTE(review): the label text contains a spelling mistake ("squeres");
    # it is left untouched here because it is runtime output.
    plt.ylabel('Average within sum of squeres')
    plt.title(title)
def plot_clusters(clusters, num_clusters,title="Histogram"):
    """Print and plot how many samples fall into each cluster.

    clusters     -- iterable of integer cluster labels, one per sample
    num_clusters -- number of bars; valid labels are 0 .. num_clusters-1
    title        -- figure title

    Negative labels are ignored. Labels >= num_clusters are printed and
    skipped (previously they were printed and then raised IndexError).
    """
    sizes = [0]*num_clusters
    for i in clusters:
        if i < 0:
            continue
        if i >= num_clusters:
            # Report the unexpected label but keep counting the rest.
            print(i)
            continue
        sizes[i] += 1
    print (sizes)

    # Bars start half a unit left of the cluster index.
    left = [i - 0.5 for i in range(len(sizes))]
    fig = plt.figure(figsize=(12,8))
    plt.title(title)
    ax = fig.add_subplot(111)
    ax.bar(left, sizes, color = palette)
def plot_cluster_feature(clusters, num_clusters, X, feature_index, title=None):
    """Print and plot the mean value of one feature for every cluster.

    clusters      -- iterable of integer cluster labels, one per row of X
    num_clusters  -- number of bars; valid labels are 0 .. num_clusters-1
    X             -- rows of feature values, indexable as row[feature_index]
    feature_index -- column whose per-cluster mean is plotted
    title         -- optional figure title; a generic one is generated when
                     omitted

    Rows whose label is outside 0 .. num_clusters-1 are skipped. Clusters
    without rows get a mean of 0.
    """
    if title is None:
        # The original body read an undefined local `title`, which resolved
        # to pylab's title() function through the star import.
        title = "Mean of feature {} per cluster".format(feature_index)

    sizes = [0]*num_clusters
    sums = [0]*num_clusters
    for d, cluster in zip(X, clusters):
        # Skip the row instead of aborting the whole accumulation, and do
        # not let negative labels index from the end of the lists.
        if cluster < 0 or cluster >= num_clusters:
            continue
        sizes[cluster] += 1
        sums[cluster] += d[feature_index]

    # float() keeps the mean exact under Python 2 integer division.
    values = [float(sums[i]) / sizes[i] if sizes[i] > 0 else 0
              for i in range(num_clusters)]
    print (values)

    # Bars start half a unit left of the cluster index.
    left = [i - 0.5 for i in range(len(values))]
    fig = plt.figure(figsize=(12,8))
    plt.title(title)
    ax = fig.add_subplot(111)
    ax.bar(left, values, color = palette)
def get_clustering_model(X, num_clusters):
    """Cluster X with k-means and return the label predicted for each row.

    Fits a KMeans model with ``num_clusters`` clusters, plots the resulting
    cluster sizes via plot_clusters() and returns the labels.
    """
    kmeans_options = dict(
        n_clusters=num_clusters,
        precompute_distances=True,
        max_iter=500,
        n_init=30,
    )
    kmeans = KMeans(**kmeans_options)
    kmeans.fit(X)
    labels = kmeans.predict(X)
    plot_clusters(labels, num_clusters)
    return labels
def _select_num_clusters(cost):
    """Pick a cluster count from the cost curve using finite differences.

    Returns the first candidate found from the second differences, else from
    the third differences, else the fallback value 4.
    """
    derivative1 = [cost[i+1]-cost[i] for i in range(len(cost)-1)]
    derivative2 = [derivative1[i+1]-derivative1[i] for i in range(len(derivative1)-1)]
    # NOTE(review): the extrema are searched in the *argsort* of the
    # differences (their rank order), not in the differences themselves.
    # This is kept exactly as in the original; confirm it is intended.
    if len(derivative2) > 0:
        max2 = argrelextrema(np.argsort(derivative2), np.less)
        if len(max2[0]) > 0:
            return max2[0][0] + 3

    derivative3 = [derivative2[i+1]-derivative2[i] for i in range(len(derivative2)-1)]
    if len(derivative3) > 0:
        max3 = argrelextrema(np.argsort(derivative3), np.greater)
        if len(max3[0]) > 0:
            return max3[0][0] + 4
    return 4


def get_best_clustering_model(X, max_number_of_clusters, title):
    """Fit k-means for k = 1 .. max_number_of_clusters and pick one model.

    X                      -- feature matrix passed to KMeans
    max_number_of_clusters -- largest k to try (must be >= 1)
    title                  -- text appended to the cost-plot title and used
                              as the title of the cluster-size plot

    Prints the cluster sizes for every k, plots the cost curve and the sizes
    of the selected clustering.

    Returns (labels, number_of_clusters, costs) where costs[k-1] is the
    inertia of the model fitted with k clusters.

    Raises ValueError when max_number_of_clusters is smaller than 1.
    """
    if max_number_of_clusters < 1:
        raise ValueError("max_number_of_clusters must be at least 1")

    cost = []
    kms = []
    # calculate all the clustering and cost
    for no_of_clusters in range(1, max_number_of_clusters+1):
        km = KMeans(n_clusters=no_of_clusters, precompute_distances = True, max_iter = 500, n_init = 30)
        km.fit(X)
        kms.append(km)
        sizes = np.bincount(km.predict(X), minlength=no_of_clusters).tolist()
        print (sizes)
        cost.append(km.inertia_)

    # Clamp the choice: the fallback value (4) would index past `kms` when
    # fewer than four models were fitted.
    num_clusters = min(_select_num_clusters(cost), len(kms))
    model = kms[num_clusters-1]

    # plot costs
    plot_costs(cost, model.n_clusters, "Cost of k-Means." + title)
    clusters = model.predict(X)
    plot_clusters(clusters, model.n_clusters, title)
    return clusters, model.n_clusters, cost
import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook
from plotly.plotly import iplot
from plotly.graph_objs import Scatter3d, Data, Marker
import plotly.graph_objs as go
def plot3(feature_indexes, X, clusters, selected_clusters, title = "Cluster", cluster_titles = []):
    """Draw an interactive 3D scatter plot of the clustered rows of X.

    feature_indexes   -- three column indexes of X used as x, y and z
    X                 -- 2-D numpy array, one row per sample
    clusters          -- integer cluster label per row of X
    selected_clusters -- labels to draw; None, an empty sequence or a
                         negative scalar (the notebook passes -1) draws every
                         cluster, a non-negative scalar draws just that one
    title             -- figure title
    cluster_titles    -- optional legend names indexed by cluster label; the
                         label itself is used when no name is given

    Axis titles are read from the notebook-level ``features`` list using the
    same indexes, so they are only meaningful while the columns of X still
    correspond to that list.
    """
    # Normalise the selection. The original called len() on the argument and
    # therefore raised TypeError for the scalar -1 used by most callers.
    if selected_clusters is None:
        selected = []
    elif np.ndim(selected_clusters) == 0:
        selected = [] if selected_clusters < 0 else [selected_clusters]
    else:
        selected = list(selected_clusters)

    # A plain list would make `labels == i` a single bool instead of a mask.
    labels = np.asarray(clusters)
    num_clusters = int(labels.max()) + 1 if labels.size else 0

    clusters_plot = []
    for i in range(0, num_clusters):
        if len(selected) > 0 and (i not in selected):
            continue
        d = X[labels == i, :]
        # Wrap around so labels beyond the palette size still get a colour.
        rgb = palette[i % len(palette)]
        cluster = Scatter3d(
            x=d[:,feature_indexes[0]],
            y=d[:,feature_indexes[1]],
            z=d[:,feature_indexes[2]],
            mode='markers',
            name = cluster_titles[i] if i < len(cluster_titles) else '{}'.format(i),
            marker=dict(
                color='rgb({}, {}, {})'.format(rgb[0]*255, rgb[1]*255, rgb[2]*255),
                size=12,
                line=dict(
                    color='rgb(204, 204, 204)',
                    width=0.0
                ),
                opacity=0.5
            )
        )
        clusters_plot.append(cluster)
    data = Data(clusters_plot)

    bk_color = "rgb(224, 224, 224)"

    def axis(feature_index):
        # One axis description: grey background with white grid/zero lines.
        return dict(
            title = features[feature_index],
            showbackground=True,
            backgroundcolor=bk_color,
            gridcolor="rgb(255, 255, 255)",
            zerolinecolor="rgb(255, 255, 255)",
        )

    layout = go.Layout(
        margin=dict(l=0, r=0, b=0,t=60),
        title=title,
        height = 1000,
        width = 1000,
        scene=go.Scene(
            xaxis=axis(feature_indexes[0]),
            yaxis=axis(feature_indexes[1]),
            zaxis=axis(feature_indexes[2]),
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)
def plot_intersection(clusters, num_clusters, id_incident, ips, id_incident2, cluster2 = -1):
    """Show, per cluster, how many IPs also appear in another incident.

    clusters      -- cluster label per session of the current incident
    num_clusters  -- clusters 0 .. num_clusters-1 are reported
    id_incident   -- id of the current incident (used in labels only)
    ips           -- IP per session, aligned with ``clusters``
    id_incident2  -- incident whose IPs are fetched via tools.get_ips()
    cluster2      -- cluster argument forwarded to tools.get_ips()

    Renders a stacked bokeh bar chart: unique IPs vs. shared IPs per cluster.
    """
    label_array = np.array(clusters)
    ip_array = np.array(ips)
    other_ips = set(tools.get_ips(id_incident2, cluster2))

    cluster_ids = list(range(0, num_clusters))
    unique_counts = []
    intersections = []
    # NOTE(review): percentages are computed but not used anywhere below.
    percentages = []
    for cluster in cluster_ids:
        cluster_ips = set(ip_array[label_array == cluster])
        shared = len(other_ips.intersection(cluster_ips))
        intersections.append(shared)
        unique_counts.append(len(cluster_ips) - shared)
        if len(cluster_ips) == 0:
            percentages.append(0)
        else:
            percentages.append(shared*100.0/len(cluster_ips))

    # First all "unique" rows, then all "intersection" rows.
    unique_label = "Unique from incident {}".format(id_incident)
    shared_label = "Intersection with incident {}".format(id_incident2)
    d = {
        "Cluster": cluster_ids + cluster_ids,
        "Incident": [unique_label]*len(cluster_ids) + [shared_label]*len(cluster_ids),
        "data": unique_counts + intersections,
    }

    df = pd.DataFrame(d)
    chart_title = "Intersection. Incident {} vs. Incident {} (cluster={})".format(id_incident, id_incident2, cluster2)
    p = Bar(df, label='Cluster', values='data', stack='Incident', legend='top_right',
            title = chart_title,
            ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)
def plot_countries(clusters, num_clusters, sessions, num_countries = 10):
    """Show, per cluster, the session counts of the most frequent countries.

    clusters      -- cluster label per session
    num_clusters  -- clusters 0 .. num_clusters-1 are reported
    sessions      -- sequence of session records providing 'id_country'
    num_countries -- how many of the most frequent countries to show

    Country records come from tools.get_countries() and are matched on their
    "id" field; their "name" field labels the chart. A country id with no
    matching record is shown as "Unknown (<id>)" (previously the lookup
    raised StopIteration).
    """
    countries = tools.get_countries()
    ids = np.array([s['id_country'] for s in sessions])

    # first find the best countries
    if num_countries > len(countries):
        num_countries = len(countries)

    # find the most frequent countries in total
    freq = itemfreq(ids)
    sorted_countries = sorted(freq, key=lambda k: k[1], reverse=True)
    best_countries = sorted_countries[0:num_countries]

    codes = []
    for best in best_countries:
        country_id = best[0]
        # Builtin next() with a default works on Python 2.6+ and 3, and does
        # not blow up when the id is missing from the lookup table.
        c = next((item for item in countries if item["id"] == country_id), None)
        if c is None:
            c = {"id": country_id, "name": "Unknown ({})".format(country_id)}
        codes.append(c)

    # calculate best countries count per cluster
    clusters_np = np.array(clusters)
    freqs = [itemfreq(ids[clusters_np == cluster]) for cluster in range(0, num_clusters)]

    d = {"Cluster": [], "Country": [], "data": []}
    for i in range(0, len(best_countries)):
        for cluster in range(0, num_clusters):
            d["Cluster"].append(cluster)
            d["Country"].append(codes[i]["name"])
            # Count for this country inside this cluster; 0 when absent.
            count = 0
            for f in freqs[cluster]:
                if f[0] == best_countries[i][0]:
                    count = f[1]
                    break
            d["data"].append(count)

    df = pd.DataFrame(d)
    p = Bar(df, label='Cluster', values='data', stack='Country', legend='top_right',
            title = "Countries" ,
            ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)
def plot_ban(clusters, num_clusters, sessions):
    """Show, per cluster, how many sessions were banned vs. served.

    clusters     -- cluster label per session
    num_clusters -- clusters 0 .. num_clusters-1 are reported
    sessions     -- sequence of session records providing 'ban'
                    (a value of 1 counts as banned)

    Renders a stacked bokeh bar chart, then prints the banned count per
    cluster and the banned percentage per cluster (two decimals).
    """
    label_array = np.array(clusters)
    ban_flags = np.array([session['ban'] for session in sessions])

    cluster_ids = list(range(0, num_clusters))
    served = []
    banned = []
    percentage = []
    for cluster in cluster_ids:
        in_cluster = ban_flags[label_array == cluster]
        total = in_cluster.shape[0]
        banned_count = in_cluster[in_cluster == 1].shape[0]
        banned.append(banned_count)
        if total == 0:
            percentage.append(0)
        else:
            percentage.append(float("{0:.2f}".format(banned_count*100.0/total)))
        served.append(total - banned_count)

    # First all "Served" rows, then all "Banned" rows.
    d = {
        "Cluster": cluster_ids + cluster_ids,
        "Ban": ["Served"]*len(cluster_ids) + ["Banned"]*len(cluster_ids),
        "data": served + banned,
    }

    df = pd.DataFrame(d)
    p = Bar(df, label='Cluster', values='data', stack='Ban', legend='top_right',
            title = "Banjax Ban" ,
            ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)
    print(banned)
    print(percentage)
# plot selected clusters from different incidents
"""
features = [
"request_interval", #1
"ua_change_rate",#2
"html2image_ratio",#3
"variance_request_interval",#4
"payload_average",#5
"error_rate",#6
"request_depth",#7
"request_depth_std",#8
"session_length",#9
"percentage_cons_requests",#10
]
"""
def plot_incidents(id_incidents, features = [
        "html2image_ratio",#3
        "variance_request_interval",#4
        "error_rate",#6
        ]):
    """Plot the selected cluster of several incidents in one 3D scatter.

    id_incidents -- incident ids to load through ``tools``
    features     -- session fields used as columns; the first three become
                    the x, y and z axes

    For every incident only the sessions whose 'cluster_index' equals the
    incident's own 'cluster_index' are kept. Each incident is drawn as its
    own group, titled "Incident <id>, cluster <cluster_index>".

    NOTE(review): plot3() labels the axes from the notebook-level
    ``features`` list, not from this parameter, so the axis titles can
    differ from the columns plotted here.
    """
    values = []
    incident_indexes = []
    titles = []
    # enumerate() replaces the manual counter; the loop variable no longer
    # shadows the builtin id().
    for position, incident_id in enumerate(id_incidents):
        print("Indicent {} loading...".format(incident_id))
        sessions = tools.get_sessions(incident_id)
        incident = tools.get_incident(incident_id)[0]
        for s in sessions:
            if s['cluster_index'] != incident['cluster_index']:
                continue
            values.append([s[f] for f in features])
            incident_indexes.append(position)
        titles.append("Incident {}, cluster {}".format(incident_id, incident['cluster_index']))

    if not values:
        # Nothing matched: an empty matrix cannot be sliced or plotted.
        print("No sessions found for the selected clusters.")
        return

    X = np.array(values)
    incident_indexes = np.array(incident_indexes)
    # An empty selection means "draw every group"; the former -1 made
    # plot3() fail on len(-1).
    plot3([0,1,2], X, incident_indexes, [], "Incident clusters", titles)
In [99]:
#Report Configuration
# id_incident: incident loaded by the "Reading from Database" cell.
id_incident = 29
# features: session fields used as columns of the feature matrix X, in this
# order. Entries that are commented out are excluded from X; the trailing
# numbers match the numbering of the full feature list shown earlier.
features = [
"request_interval", #1
"ua_change_rate",#2
"html2image_ratio",#3
"variance_request_interval",#4
#"payload_average",#5
#"error_rate",#6
"request_depth",#7
"request_depth_std",#8
"session_length",#9
#"percentage_cons_requests",#10
]
# Largest k tried by get_best_clustering_model().
max_number_of_clusters = 15
In [100]:
# Reading from Database
# Load one incident and its sessions, then build the standardised feature
# matrix X (one row per session, one column per entry of `features`) and the
# parallel list of session IPs.
incident = sessions = None
incident = tools.get_incident(id_incident)
sessions = tools.get_sessions(id_incident)
#tools.disconnect_from_db()

print ("Incident {} loaded:".format(id_incident))
print ("Start: {}".format(incident[0]["start"]))
print ("Stop : {}".format(incident[0]["stop"]))
#print ("Comment : {}".format(incident[0]["comment"]))

values = [[s[f] for f in features] for s in sessions]
ips = [s["IP"] for s in sessions]

X = np.array(values)
print (X.shape)

# normalization
std_scale = preprocessing.StandardScaler()
std_scale.fit(X)
X = std_scale.transform(X)
In [84]:
# Load the sessions of several incidents and build one standardised feature
# matrix X over all of them (columns follow the notebook-level `features`).
# Alternative incident sets:
#id_incidents = [24,25,26,19,27] # Kotsubynske
#id_incidents = [19]
id_incidents = [29,30,31,32,33,34] #Bdsmovemenet
#id_incidents = [31,32,33,34] #Bdsmovemenet, last 4 incidents

sessions = []
# `incident_id` avoids rebinding the builtin id() for the whole notebook, and
# extend() avoids re-copying the accumulated list on every iteration.
for incident_id in id_incidents:
    print("Indicent {} loading...".format(incident_id))
    sessions.extend(tools.get_sessions(incident_id))

values = [[session[f] for f in features] for session in sessions]
X = np.array(values)

# normalization
std_scale = preprocessing.StandardScaler().fit(X)
X = std_scale.transform(X)
print (X.shape)
print("Done.")
In [130]:
# perform PCA dimensionality reduction
# Project X onto 3 components for visualisation. X is overwritten, so from
# here on its columns are components, not the entries of `features`.
pca = sklearn.decomposition.RandomizedPCA(n_components=3)
pca.fit(X)
X = pca.transform(X)
# elbow method for PCA
#clusters, num_clusters, costs_pca = get_best_clustering_model(X, max_number_of_clusters, "PCA")
#print ("Num clusters(PCA):", num_clusters)
#plot3([0,1,2], X, clusters, -1)
In [104]:
# DBSCAN clustering
from sklearn.cluster import DBSCAN
from sklearn import metrics

# Compute DBSCAN
db = DBSCAN(eps=0.2, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Shift every label up by one so that noise (DBSCAN label -1) lands in
# bucket 0 and real clusters start at 1.
clusters = db.labels_.astype(int) + 1

# Number of buckets, including the noise bucket 0. The previous
# `len(set(clusters)) - (1 if -1 in clusters else 0)` could never see -1
# after the shift, and without noise it was one smaller than the largest
# label + 1, which made plot_clusters index out of range.
num_clusters = int(clusters.max()) + 1
plot_clusters(clusters, num_clusters)
print('Estimated number of clusters: %d' % num_clusters)
#plot3([0,1,2], X, clusters, -1)
In [107]:
#plot3([1,0,3], X, clusters, -1)
# Draw only clusters 1 and 2, using columns 2, 0 and 1 as x, y and z.
plot3(feature_indexes=[2, 0, 1], X=X, clusters=clusters, selected_clusters=[1, 2])
In [131]:
# K-Means
# Seed numpy's global generator explicitly. The bare `random.seed` only
# reached numpy through the star imports at the top of the notebook.
np.random.seed(666)
clusters, num_clusters, costs = get_best_clustering_model(X, max_number_of_clusters, "")
# A formatted string prints the same text on Python 2 and 3; the former
# `print (a, b)` printed a tuple on Python 2.
print("Num clusters: {}".format(num_clusters))
#plot3([0,1,2], X, clusters, -1)
In [133]:
# An empty selection draws every cluster; the former -1 made plot3() fail
# on len(-1).
# NOTE(review): column index 3 does not exist once the PCA cell has reduced
# X to 3 components; confirm which X this cell is meant to run on.
plot3([2,0,3], X, clusters, [])
In [57]:
# custom number of clusters
num_clusters = 6
clusters = get_clustering_model(X, num_clusters)
# An empty selection draws every cluster; the former -1 made plot3() fail
# on len(-1).
plot3([2,0,3], X, clusters, [])
In [26]:
# Re-cluster the sessions of one cluster of the main clustering.
target_cluster = 1
# Separate name for the sub-cluster count: reusing `num_clusters` would
# leave it out of sync with `clusters`, which later cells pass together.
num_clusters2 = 5

X2 = X[clusters == target_cluster]
sessions2 = [s for s, cluster in zip(sessions, clusters) if cluster == target_cluster]
clusters2 = get_clustering_model(X2, num_clusters2)
In [103]:
# An empty selection draws every sub-cluster; the former -1 made plot3()
# fail on len(-1).
plot3([2,0,1], X2, clusters2, [])
In [ ]:
# saving clustering
# Store the main clustering, and the sub-clustering when one was computed.
tools.save_clustering(sessions, clusters)
if not (clusters2 is None or len(clusters2) <= 0):
    tools.save_clustering2(sessions2, clusters2)
In [30]:
# Reset the attack labels of the configured incident, then label it again.
# `d_incident` was never defined anywhere in the notebook (NameError); the
# incident id used by the next call is the intended argument.
tools.clear_attack(id_incident)
tools.label_attack(id_incident, 1, [1], [0,2])
In [85]:
# Mean of column 0 of X for clusters 0..9.
plot_cluster_feature(clusters=clusters, num_clusters=10, X=X, feature_index=0)
In [96]:
def box_plot_feature(clusters, num_clusters, X, feature_index):
    """Draw one box plot per cluster for a single feature column of X.

    clusters      -- integer cluster label per row of X
    num_clusters  -- clusters 0 .. num_clusters-1 each get a box
    X             -- 2-D numpy array, one row per sample
    feature_index -- column of X to plot; also indexes the notebook-level
                     ``features`` list for the figure title
    """
    labels = np.asarray(clusters)
    traces = []
    for i in range(0, num_clusters):
        traces.append(go.Box(
            # Column `feature_index` of the rows in cluster i. The former
            # `X[mask][feature_index]` picked a single row instead, i.e. all
            # features of one sample.
            y = X[labels == i, feature_index],
            boxpoints='all',
            jitter=0.5,
            name='{}'.format(i),
            pointpos=-1.8,
        ))
    data = Data(traces)

    layout = go.Layout(
        showlegend=False,
        height = 900,
        title='Feature {}'.format(features[feature_index]),
        xaxis=go.XAxis(
            showgrid=True,
            showline=True,
            ticks=''
        ),
        yaxis=go.YAxis(
            showline=True,
            ticks='',
            zeroline=True,
            #range = [0,300],
            title = "Value"
        )
    )
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)
In [97]:
# Box plots of feature column 3 for clusters 0..9.
box_plot_feature(clusters=clusters, num_clusters=10, X=X, feature_index=3)
IP intersection with another incident
In [13]:
# Intersection with another incident(and cluster)
plot_intersection(
    clusters=clusters,
    num_clusters=num_clusters,
    id_incident=id_incident,
    ips=ips,
    id_incident2=34,
    cluster2=-1,
)
In [37]:
#num_clusters = 14
#clusters = get_clustering_model(X, num_clusters)
# Per-cluster session counts for the 8 most frequent countries.
plot_countries(clusters=clusters, num_clusters=num_clusters, sessions=sessions, num_countries=8)
In [54]:
# banjax ban feature
plot_ban(clusters=clusters, num_clusters=num_clusters, sessions=sessions)
In [115]:
#ua
def plot_ua(clusters, num_clusters, sessions):
    """Show, per cluster, how many sessions use a suspected user agent.

    clusters     -- cluster label per session
    num_clusters -- clusters 0 .. num_clusters-1 are reported
    sessions     -- sequence of session records providing 'ua'

    A session counts as suspected when its 'ua' value is not None and
    contains "WordPress". Renders a stacked bokeh bar chart, then prints the
    suspected count per cluster and its percentage (two decimals).
    """
    clusters_np = np.array(clusters)
    #uas = np.array([s['ua'] == 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)' for s in sessions])
    uas = np.array([s['ua'] is not None and s['ua'].find("WordPress") >= 0
                    for s in sessions])

    cluster_ids = list(range(0, num_clusters))
    other = []
    ua = []
    percentage = []
    for cluster in cluster_ids:
        cluster_total = uas[clusters_np == cluster]
        total = cluster_total.shape[0]
        suspected = cluster_total[cluster_total == 1].shape[0]
        ua.append(suspected)
        if total == 0:
            percentage.append(0)
        else:
            percentage.append(float("{0:.2f}".format(suspected*100.0/total)))
        other.append(total - suspected)

    # First all "Other" rows, then all suspected rows.
    d = {
        "Cluster": cluster_ids + cluster_ids,
        "UserAgent": ["Other"]*len(cluster_ids) + [" Suspected User Agent"]*len(cluster_ids),
        "data": other + ua,
    }

    df = pd.DataFrame(d)
    p = Bar(df, label='Cluster', values='data', stack='UserAgent', legend='top_right',
            title = "User agent in clusters" ,
            ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)
    # Print this function's own counts; the former `print banned` referred
    # to a name that is not defined in this function.
    print(ua)
    print(percentage)
In [116]:
# Suspected user agents per cluster of the current clustering.
plot_ua(clusters=clusters, num_clusters=num_clusters, sessions=sessions)
In [91]:
# Number of sessions currently loaded.
print(len(sessions))
In [45]:
# ban
# 1 for sessions whose 'ban' field equals 1, otherwise 0; used as a
# two-group "clustering" for the scatter plot.
banned = np.array([1 if s['ban'] == 1 else 0 for s in sessions])
# An empty selection draws both groups; the former -1 made plot3() fail on
# len(-1).
plot3([4,1,3], X, banned, [], "Banned IPs vs Regular IPs")
In [2]:
# Run the intersection calculation of BothoundTools for incident 19.
# NOTE(review): what is intersected and where the result is stored is
# defined inside BothoundTools and is not visible in this notebook.
tools.calculate_all_intersections(19)
In [74]:
# Compare the selected clusters of incidents 19 and 27.
# plot_incidents() plots only the first three listed features.
plot_incidents(
    id_incidents=[19, 27],
    features=[
        "request_interval", #1
        #"ua_change_rate",#2
        "html2image_ratio",#3
        "variance_request_interval",#4
        #"payload_average",#5
        "error_rate",#6
        #"request_depth",#7
        #"request_depth_std",#8
        #"session_length",#9
        #"percentage_cons_requests",#10
    ],
)
In [ ]: