In [105]:
    
# initialization
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn import preprocessing
from scipy.spatial.distance import cdist,pdist
from scipy.signal import argrelextrema
%matplotlib inline
from pylab import *
from numpy import *
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from scipy.stats import itemfreq
# boeh
from collections import OrderedDict
import pandas as pd
import bokeh.plotting as bk
bk.output_notebook()
from bokeh.charts import Bar
from bokeh.charts import Histogram
# Turn on the code-folding notebook extension. Only needs to run once.
ext_require_path = 'usability/codefolding/main'
from notebook.nbextensions import EnableNBExtensionApp
if not hasattr(EnableNBExtensionApp(), 'enable_nbextension'):
    # The app object has no enable_nbextension method here, so use the
    # module-level helper instead.
    from notebook.nbextensions import enable_nbextension
    enable_nbextension('notebook', ext_require_path)
else:
    EnableNBExtensionApp().enable_nbextension(ext_require_path)
    
import yaml
from bothound_tools import BothoundTools
# Base RGB triples (0-255 per channel); get_palette() cycles through them.
color_set = [
    [0,   0,   255],   # blue
    [255, 0,   0],     # red
    [0,   255, 0],     # green
    [255, 255, 0],     # yellow
    [255, 0,   255],   # magenta
    [255, 128, 128],   # pink
    [128, 128, 128],   # gray
    [128, 0,   0],     # brown
    [255, 128, 0],     # orange
]
# Load the BotHound configuration and open the database connection that the
# helper functions below reach through the module-level `tools` object.
with open("../conf/bothound.yaml", "r") as stram:
    # The context manager closes the file handle as soon as parsing is done.
    # NOTE(review): yaml.load() without an explicit Loader can construct
    # arbitrary Python objects. That is acceptable for a trusted local config
    # file; switch to yaml.safe_load() if the file may come from elsewhere.
    conf = yaml.load(stram)
tools = BothoundTools(conf)
tools.connect_to_db()
def get_palette(N=5):
    """Return N RGBA colours with channels scaled to 0..1 and alpha fixed at 1.

    Colours are taken from ``color_set`` in order and repeat cyclically once
    N exceeds the number of base colours.
    """
    base_count = len(color_set)
    return [
        [channel / 255.0 for channel in color_set[index % base_count][:3]] + [1]
        for index in range(N)
    ]

# Shared palette used by the plotting helpers below.
palette = get_palette(80)
def plot_costs(costs, num_clusters, title):
    """Plot the clustering cost curve and circle the selected cluster count.

    costs        -- sequence of costs; costs[k-1] is drawn at x = k.
    num_clusters -- selected k (1-based); its point is marked with a red circle.
    title        -- title of the figure.
    """
    # Unused locals from the original were dropped. One of them called
    # cm.spectral, which fails on matplotlib versions that no longer ship
    # that colormap.
    ks = list(range(1, len(costs) + 1))
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # elbow curve
    ax.plot(ks, costs, 'b*-')
    # highlight the chosen number of clusters
    ax.plot(num_clusters, costs[num_clusters-1], marker='o', markersize=14,
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    #ax.set_ylim((0,100))
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within sum of squeres')
    plt.title(title)
    
def plot_clusters(clusters, num_clusters,title="Histogram"):
    """Print and plot the number of members of each cluster.

    clusters     -- iterable of integer cluster labels, one per sample.
    num_clusters -- number of histogram slots (labels 0 .. num_clusters-1).
    title        -- title of the figure.

    Negative labels are ignored. A label >= num_clusters is printed and then
    skipped; the original crashed with IndexError right after printing it.
    """
    sizes = [0]*num_clusters
    for i in clusters:
        if i < 0:
            continue
        if i >= num_clusters:
            # report the unexpected label but keep the histogram usable
            print(i)
            continue
        sizes[i] += 1
    print (sizes)

    # plot histogram; bar x positions are i - 0.5
    left = [i - 0.5 for i in range(len(sizes))]
    fig = plt.figure(figsize=(12,8))
    plt.title(title)
    ax = fig.add_subplot(111)
    ax.bar(left,sizes, color = palette)
    
def plot_cluster_feature(clusters, num_clusters, X, feature_index, title=None):
    """Print and plot the mean of one feature for each cluster.

    clusters      -- iterable of integer cluster labels, aligned with rows of X.
    num_clusters  -- number of clusters to report (labels 0 .. num_clusters-1).
    X             -- rows of feature values; row[feature_index] is averaged.
    feature_index -- index of the feature inside each row.
    title         -- optional figure title. The original referenced an
                     undefined local ``title``; a descriptive default is used
                     when none is given.

    Rows whose label is outside 0 .. num_clusters-1 are skipped. The original
    stopped at the first such row (dropping every later row) and let negative
    labels index from the end of the accumulator lists.
    """
    if title is None:
        title = "Mean of feature {} per cluster".format(feature_index)

    sizes = [0]*num_clusters
    sums = [0]*num_clusters
    for d, cluster in zip(X,clusters):
        if cluster < 0 or cluster >= num_clusters:
            continue
        sizes[cluster] += 1
        sums[cluster] += d[feature_index]

    # Empty clusters keep a mean of 0. float() avoids integer division on
    # Python 2 when the feature values are ints.
    values = [sums[i] / float(sizes[i]) if sizes[i] > 0 else 0
              for i in range(num_clusters)]
    print (values)

    # plot histogram; bar x positions are i - 0.5
    left = [i - 0.5 for i in range(len(values))]
    fig = plt.figure(figsize=(12,8))
    plt.title(title)
    ax = fig.add_subplot(111)
    ax.bar(left,values, color = palette)
    
def get_clustering_model(X, num_clusters):
    """Fit k-means with ``num_clusters`` clusters on X, plot the cluster-size
    histogram and return the label predicted for each row of X."""
    kmeans = KMeans(
        n_clusters=num_clusters,
        precompute_distances=True,
        max_iter=500,
        n_init=30,
    ).fit(X)

    labels = kmeans.predict(X)
    plot_clusters(labels, num_clusters)
    return labels
def get_best_clustering_model(X, max_number_of_clusters, title):
    """Fit k-means for k = 1 .. max_number_of_clusters and pick a k from the
    shape of the cost (inertia) curve.

    X                      -- data passed to KMeans.fit / predict.
    max_number_of_clusters -- largest k to try.
    title                  -- title suffix / title for the two plots.

    Returns a tuple (clusters, n_clusters, cost): the labels predicted by the
    selected model, its number of clusters, and the list of inertias for
    every k tried.
    """
    cost = []
    kms = []
    # fit one model per candidate k and record its cost
    for no_of_clusters in range(1, max_number_of_clusters+1):
        km = KMeans(n_clusters=no_of_clusters, precompute_distances = True, max_iter = 500, n_init = 30)
        km.fit(X)
        kms.append(km)
        sizes = [0]*no_of_clusters
        for i in km.predict(X):
            if(i >= no_of_clusters):
                print(i)
            sizes[i] = sizes[i]+1
        print (sizes)
        cost.append(km.inertia_)
    # calculate first derivative
    derivative1 = [cost[i+1]-cost[i] for i in range(len(cost)-1)]
    #print "d1", derivative1
    # calculate second derivative
    derivative2 = [derivative1[i+1]-derivative1[i] for i in range(len(derivative1)-1)]
    #print "d2", derivative2
    # NOTE(review): the extrema are searched on the argsort permutation of the
    # derivative, not on the derivative values themselves -- confirm intended.
    max2 = argrelextrema(np.argsort(derivative2), np.less)
    # Fallback k when neither heuristic below fires. It is clamped so that it
    # stays a valid index when fewer than four models were fitted.
    num_clusters = min(4, len(kms))
    #print "max2", max2
    if(len(max2[0]) > 0):
        num_clusters = max2[0][0] + 3
    else:
        # calculate third derivative
        derivative3 = [derivative2[i+1]-derivative2[i] for i in range(len(derivative2)-1)]
        #print derivative3
        max3 = argrelextrema(np.argsort(derivative3), np.greater)
        if(len(max3[0]) > 0):
            num_clusters = max3[0][0] + 4
    model = kms[num_clusters-1]

    # plot costs
    plot_costs(cost, model.n_clusters, "Cost of k-Means." + title)
    clusters = model.predict(X)
    plot_clusters(clusters, model.n_clusters, title)
    return clusters, model.n_clusters, cost
import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook
from plotly.plotly import iplot
from plotly.graph_objs import Scatter3d, Data, Marker
import plotly.graph_objs as go
def _scene_axis(axis_title, background):
    """Build the settings shared by the x, y and z axes of the 3D scene."""
    return dict(
        title=axis_title,
        showbackground=True,                 # (!) show axis background
        backgroundcolor=background,          # set background color to grey
        gridcolor="rgb(255, 255, 255)",      # set grid line color
        zerolinecolor="rgb(255, 255, 255)",  # set zero grid line color
    )

def plot3(feature_indexes, X, clusters, selected_clusters, title = "Cluster", cluster_titles = [], feature_names = None):
    """Draw an interactive 3D scatter plot with one trace per cluster.

    feature_indexes   -- three column indexes of X used as x, y and z.
    X                 -- 2D numpy array of samples (rows) by features (columns).
    clusters          -- integer cluster label per row of X.
    selected_clusters -- labels to draw; an empty sequence, None or a negative
                         scalar draws every cluster, and a non-negative scalar
                         draws just that cluster.
    title             -- figure title.
    cluster_titles    -- optional legend names indexed by cluster label;
                         clusters without an entry are named by their label.
    feature_names     -- optional axis-title list indexed like the columns of
                         X; defaults to the module-level ``features`` list.
    """
    # Normalise the selection. The original only accepted a sequence and
    # failed with TypeError on len() for scalar values such as -1.
    if selected_clusters is None:
        selected = []
    elif np.isscalar(selected_clusters):
        selected = [] if selected_clusters < 0 else [selected_clusters]
    else:
        selected = selected_clusters

    axis_names = features if feature_names is None else feature_names
    labels = np.asarray(clusters)
    num_clusters = int(labels.max()) + 1

    clusters_plot = []
    for i in range(0, num_clusters):
        if len(selected) > 0 and (i not in selected):
            continue
        d = X[labels == i, :]
        # wrap around so more clusters than palette entries cannot overflow
        rgba = palette[i % len(palette)]
        cluster = Scatter3d(
            x=d[:,feature_indexes[0]],
            y=d[:,feature_indexes[1]],
            z=d[:,feature_indexes[2]],
            mode='markers',
            name = cluster_titles[i] if i < len(cluster_titles) else '{}'.format(i),
            marker=dict(
                color='rgb({}, {}, {})'.format(rgba[0]*255,rgba[1]*255,rgba[2]*255 ),
                size=12,
                line=dict(
                    color='rgb(204, 204, 204)',
                    width=0.0
                ),
                opacity=0.5
            )
        )
        clusters_plot.append(cluster)

    data = Data(clusters_plot)
    bk_color = "rgb(224, 224, 224)"
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0,t=60),
        title=title,
        height = 1000,
        width = 1000,
        scene=go.Scene(
            xaxis=_scene_axis(axis_names[feature_indexes[0]], bk_color),
            yaxis=_scene_axis(axis_names[feature_indexes[1]], bk_color),
            zaxis=_scene_axis(axis_names[feature_indexes[2]], bk_color),
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)
def plot_intersection(clusters, num_clusters, id_incident, ips, id_incident2, attack2 = -1):
    """Plot, per cluster, how many of its IPs also appear in another incident.

    clusters     -- cluster label per session, aligned with ``ips``.
    num_clusters -- clusters 0 .. num_clusters-1 are reported.
    id_incident  -- id of the current incident (used in labels only).
    ips          -- IP per session of the current incident.
    id_incident2 -- id of the incident to compare against.
    attack2      -- forwarded to tools.get_ips() together with id_incident2.

    Each cluster gets a stacked bar: distinct IPs only seen in this incident
    plus distinct IPs shared with the other incident.
    """
    clusters_np = np.array(clusters)
    ips_np = np.array(ips)
    ips2 = set(tools.get_ips(id_incident2, attack2))

    unique_counts = []
    intersections = []
    for cluster in range(0, num_clusters):
        cluster_ips = set(ips_np[clusters_np == cluster])
        shared = len(ips2.intersection(cluster_ips))
        intersections.append(shared)
        unique_counts.append(len(cluster_ips) - shared)

    # Long-format table: first the "unique" rows for every cluster, then the
    # "intersection" rows for every cluster.
    cluster_ids = list(range(0, num_clusters))
    d = {
        "Cluster": cluster_ids + cluster_ids,
        "Incident": (["Unique from incident {}".format(id_incident)] * num_clusters
                     + ["Intersection with incident {}".format(id_incident2)] * num_clusters),
        "data": unique_counts + intersections,
    }

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Incident',legend='top_right',
          title = "Intersection. Incident {} vs. Incident {} (cluster={})".format(id_incident, id_incident2, attack2) ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)
def plot_countries(clusters, sessions, num_clusters, num_countries = 10):
    """Plot, per cluster, the session counts of the overall most frequent
    countries as a stacked bar chart.

    clusters      -- cluster label per session, aligned with ``sessions``.
    sessions      -- sequence of session records exposing 'id_country'.
    num_clusters  -- clusters 0 .. num_clusters-1 are reported.
    num_countries -- how many of the most frequent countries to show
                     (capped at the number of known countries).
    """
    countries = tools.get_countries()
    ids = np.array([s['id_country'] for s in sessions])
    if(num_countries > len(countries)):
        num_countries = len(countries)

    # most frequent countries over all sessions
    freq = itemfreq(ids)
    sorted_countries = sorted(freq, key=lambda k: k[1], reverse=True)
    best_countries = sorted_countries[0:num_countries]

    codes = []
    for best in best_countries:
        # The builtin next() with a default works on Python 2 and 3. It also
        # avoids a bare StopIteration when a session carries a country id
        # that is missing from the lookup table.
        match = next((item for item in countries if item["id"] == best[0]), None)
        if match is None:
            match = {"id": best[0], "name": "Unknown ({})".format(best[0])}
        codes.append(match)

    # per-cluster frequency tables
    clusters_np = np.array(clusters)
    freqs = [itemfreq(ids[clusters_np == cluster]) for cluster in range(0, num_clusters)]

    # Long-format table: one row per (country, cluster) pair; the count is 0
    # when the country does not occur in that cluster.
    d = {"Cluster": [], "Country": [], "data": []}
    for best, code in zip(best_countries, codes):
        for cluster in range(0, num_clusters):
            count = 0
            for f in freqs[cluster]:
                if (f[0] == best[0]):
                    count = f[1]
                    break
            d["Cluster"].append(cluster)
            d["Country"].append(code["name"])
            d["data"].append(count)

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Country',legend='top_right',
          title = "Countries" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)
    #print d
    
def plot_ban(clusters, num_clusters, sessions):
    """Plot served vs. banned session counts per cluster and print the
    banned counts and banned percentages.

    clusters     -- cluster label per session, aligned with ``sessions``.
    num_clusters -- clusters 0 .. num_clusters-1 are reported.
    sessions     -- sequence of session records exposing 'ban'; a value of 1
                    counts as banned.
    """
    clusters_np = np.array(clusters)
    bans = np.array([s['ban'] for s in sessions])

    served = []
    banned = []
    percentage = []
    for cluster in range(0, num_clusters):
        cluster_total = bans[clusters_np == cluster]
        total_count = cluster_total.shape[0]
        banned_count = cluster_total[cluster_total==1].shape[0]
        banned.append(banned_count)
        served.append(total_count - banned_count)
        if total_count == 0:
            percentage.append(0)
        else:
            # percentage of banned sessions, rounded to two decimals
            percentage.append(float("{0:.2f}".format(banned_count*100.0/total_count)))

    # Long-format table: "Served" rows for every cluster, then "Banned" rows.
    cluster_ids = list(range(0, num_clusters))
    d = {
        "Cluster": cluster_ids + cluster_ids,
        "Ban": ["Served"] * num_clusters + ["Banned"] * num_clusters,
        "data": served + banned,
    }
    df=pd.DataFrame(d)
    chart=Bar(df,label='Cluster',values='data',stack='Ban',legend='top_right',
          title = "Banjax Ban" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(chart)
    # print(...) with one argument behaves the same on Python 2 and 3 and
    # matches the call style used elsewhere in this notebook
    print(banned)
    print(percentage)
# plot selected clusters from different incidents
"""
features = [
    "request_interval", #1
    "ua_change_rate",#2
    "html2image_ratio",#3
    "variance_request_interval",#4
    "payload_average",#5
    "error_rate",#6
    "request_depth",#7
    "request_depth_std",#8
    "session_length",#9
    "percentage_cons_requests",#10
]
"""
def plot_incidents(id_incidents, features = [
    "html2image_ratio",#3
    "variance_request_interval",#4
    "error_rate",#6
]):
    """Plot the selected cluster of several incidents in one 3D scatter plot.

    id_incidents -- iterable of incident ids to load.
    features     -- three session field names used as the x, y and z values.

    For every incident, only sessions whose 'cluster_index' equals the
    incident's own 'cluster_index' are kept; each incident becomes one trace.

    NOTE(review): plot3 takes its axis titles from the module-level
    ``features`` list, not from this function's ``features`` argument, so the
    axis labels may not match the plotted columns.
    """
    values = []
    incident_indexes = []
    titles = []
    for index, incident_id in enumerate(id_incidents):
        print("Indicent {} loading...".format(incident_id))
        sessions = tools.get_sessions(incident_id)
        incident = tools.get_incident(incident_id)[0]
        for s in sessions:
            if(s['cluster_index'] != incident['cluster_index']):
                continue
            values.append([s[f] for f in features])
            # trace number of this incident (0-based)
            incident_indexes.append(index)
        titles.append("Incident {}, cluster {}".format(incident_id, incident['cluster_index']))
    X = np.array(values)
    incident_indexes = np.array(incident_indexes)
    # An empty selection means "draw every trace". The original passed -1,
    # which plot3 cannot take len() of.
    plot3([0,1,2], X, incident_indexes, [], "Incident clusters", titles)
    
def box_plot_feature(clusters, num_clusters, X, feature_index):
    """Draw one box plot per cluster for a single feature column of X.

    clusters      -- cluster label per row of X.
    num_clusters  -- clusters 0 .. num_clusters-1 each get a box.
    X             -- 2D numpy array of samples (rows) by features (columns).
    feature_index -- column of X to plot; also indexes the module-level
                     ``features`` list for the figure title.
    """
    labels = np.asarray(clusters)
    traces = []
    for i in range(0,num_clusters):
        traces.append(go.Box(
            # All rows of cluster i, column feature_index. The original
            # indexed the row subset with [feature_index], which picked a
            # single row (one session) and not the feature column.
            y = X[labels == i][:, feature_index],
            boxpoints='all',
            jitter=0.5,
            name='{}'.format(i),
            pointpos=-1.8,

        ))

    data = Data(traces)
    layout = go.Layout(
        showlegend=False,
        height = 900,
        title='Feature {}'.format(features[feature_index]),
        xaxis=go.XAxis(
            showgrid=True,
            showline=True,
            ticks=''
        ),
        yaxis=go.YAxis(
            showline=True,
            ticks='',
            zeroline=True,
            #range = [0,300],
            title = "Value"
        )
    )
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)
    
    
    
    
    
In [150]:
    
# Report configuration: the incident to analyse, the maximum number of
# clusters setting, and the session features used for clustering.
id_incident = 50
max_number_of_clusters = 15
# Active feature columns. Uncomment an entry to include it.
features = [
    "request_interval",             # 1
    # "ua_change_rate",             # 2
    # "html2image_ratio",           # 3
    "variance_request_interval",    # 4
    # "payload_average",            # 5
    # "error_rate",                 # 6
    "request_depth",                # 7
    # "request_depth_std",          # 8
    # "session_length",             # 9
    "percentage_cons_requests",     # 10
]
    
In [151]:
    
# Load the incident record and its sessions from the database.
incident = sessions = None
incident = tools.get_incident(id_incident)
sessions = tools.get_sessions(id_incident)
#tools.disconnect_from_db()
print ("Incident {} loaded:".format(id_incident))
print ("Start: {}".format(incident[0]["start"]))
print ("Stop : {}".format(incident[0]["stop"]))
#print ("Comment : {}".format(incident[0]["comment"]))

# One feature row and one IP per session, in session order.
values, ips = [], []
for session in sessions:
    values.append([session[name] for name in features])
    ips.append(session["IP"])
X = np.array(values)
print (X.shape)

# Standardise the feature matrix; X is replaced by its scaled version.
std_scale = preprocessing.StandardScaler().fit(X)
X = std_scale.transform(X)

# No double clustering has been computed yet.
clusters2 = None
    
    
In [152]:
    
# DBSCAN clustering
from sklearn.cluster import DBSCAN
from sklearn import metrics
# Compute DBSCAN
db = DBSCAN(eps=0.2, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
# Shift every label up by one so that all labels are non-negative
# (a DBSCAN label of -1 becomes cluster 0).
clusters = db.labels_.astype(int) + 1
#clusters_dbscan = clusters_dbscan.tolist()
# Labels now run from 0 to clusters.max(), so max + 1 slots are needed. The
# original counted distinct labels and tested for -1 after the shift. That
# test could never match, and the count was one too small whenever label 0
# did not occur.
num_clusters = int(clusters.max()) + 1
plot_clusters(clusters, num_clusters)
print('Estimated number of clusters: %d' % num_clusters)
#plot3([0,1,2], X, clusters, -1)
    
    
    
In [153]:
    
# 3D scatter of the DBSCAN clusters on feature columns 0, 1 and 2.
# An empty selection draws every cluster.
#plot3([1,0,3], X, clusters, -1)
plot3(feature_indexes=[0,1,2], X=X, clusters=clusters, selected_clusters=[])
    
    
In [9]:
    
# Choose your target cluster for double clustering
target_cluster = 1
# Choose the number of clusters for K-means.
# This is kept in its own variable so that the DBSCAN `num_clusters`, which
# later cells pass together with the DBSCAN labels, is not overwritten.
num_clusters2 = 5
X2 = X[clusters == target_cluster]
# sessions belonging to the target cluster, aligned with the rows of X2
sessions2 = [s for s, cluster in zip(sessions, clusters) if cluster == target_cluster]
clusters2 = get_clustering_model(X2, num_clusters2)
    
    
    
In [10]:
    
# 3D scatter of the second-level clusters on feature columns 0, 1 and 3.
plot3(feature_indexes=[0,1,3], X=X2, clusters=clusters2, selected_clusters=[])
    
    
In [141]:
    
    
In [154]:
    
# One box-plot figure per feature column (columns 0 to 3), in order.
for feature_index in range(4):
    box_plot_feature(clusters, num_clusters = 5, X = X, feature_index = feature_index)
    
    
    
    
    
IP intersection with another incident/attack
In [17]:
    
# Per-cluster IP overlap between this incident and incident 33 (attack -1).
plot_intersection(
    clusters=clusters,
    num_clusters=num_clusters,
    id_incident=id_incident,
    ips=ips,
    id_incident2=33,
    attack2=-1,
)
    
    
    
    
In [155]:
    
#num_clusters = 14
#clusters = get_clustering_model(X, num_clusters)
# Top 8 countries across the first 8 clusters.
plot_countries(clusters=clusters, sessions=sessions, num_clusters=8, num_countries=8)
    
    
    
    
In [156]:
    
# Banjax ban feature: served vs. banned sessions per cluster.
plot_ban(clusters=clusters, num_clusters=num_clusters, sessions=sessions)
    
    
    
    
    
In [157]:
    
# Persist the first-level clustering.
tools.save_clustering(sessions, clusters)

# Persist the double clustering, but only when one was computed.
if not (clusters2 is None or len(clusters2) == 0):
    tools.save_clustering2(sessions2, clusters2)

# Clear the attack labels stored for this incident.
tools.clear_attack(id_incident)

# Label the first attack. The incident id doubles as the attack number.
tools.label_attack(
    id_incident,
    attack_number = id_incident,
    selected_clusters = [1,2],
    selected_clusters2 = [],
)
# save the second attack
#tools.label_attack(id_incident, attack_number = 2, selected_clusters = [2], selected_clusters2 = [])