Clustering

Bothound project

Initialization


In [83]:
# initialization
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn import preprocessing
from scipy.spatial.distance import cdist,pdist
from scipy.signal import argrelextrema
%matplotlib inline
from pylab import *
from numpy import *
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from scipy.stats import itemfreq

# boeh
from collections import OrderedDict
import pandas as pd
import bokeh.plotting as bk
bk.output_notebook()
from bokeh.charts import Bar
from bokeh.charts import Histogram

# enabling folding extension. Run it once.
import notebook
E = notebook.nbextensions.EnableNBExtensionApp()
E.enable_nbextension('usability/codefolding/main')

import yaml
from bothound_tools import BothoundTools

color_set = [
    [0, 0, 255],      #Blue
    [255, 0, 0],      #Red
    [0, 255, 0],      #Green
    [255, 255, 0],    #Yellow
    [255, 0, 255],    #Magenta
    [255, 128, 128],  #Pink
    [128, 128, 128],  #Gray
    [128, 0, 0],      #Brown
    [255, 128, 0],    #Orange
]

stram = open("../conf/bothound.yaml", "r")
conf = yaml.load(stram)
tools = BothoundTools(conf)
tools.connect_to_db()

def get_palette(N=5):
    result = []
    for x in range(N):
        s = color_set[x % len(color_set)]
        result.append([s[0]/255.0,s[1]/255.0,s[2]/255.0,1])
    return result
palette = get_palette(80)

def plot_costs(costs, num_clusters, title):
    KK = range(1,len(costs)+1)

    # elbow curve
    kIdx = num_clusters      
    clr = cm.spectral( np.linspace(0,1,10) ).tolist()
    mrk = 'os^p<dvh8>+x.'

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(KK, costs, 'b*-')
    ax.plot(num_clusters, costs[num_clusters-1], marker='o', markersize=14, 
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    #ax.set_ylim((0,100))
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within sum of squeres')
    plt.title(title)
    
def plot_clusters(clusters, num_clusters,title="Histogram"):
    sizes = [0]*num_clusters
    for i in clusters: 
        if(i >= 0) :
            if (i >= num_clusters):
                print i
            sizes[i] = sizes[i]+1
    print (sizes)
    
      #plot histogramm
    left = [] 
    for i in range(len(sizes)):
        left.append(i-0.5)
    fig = plt.figure(figsize=(12,8))
    plt.title(title)
    ax = fig.add_subplot(111)
    ax.bar(left,sizes, color = palette)            
    
def plot_cluster_feature(clusters, num_clusters, X, feature_index):
    sizes = [0]*num_clusters
    sums = [0]*num_clusters
    values = [0]*num_clusters
    for d, cluster in zip(X,clusters):
        if (cluster >= num_clusters):
            break
        sizes[cluster] = sizes[cluster] + 1
        sums[cluster] = sums[cluster] + d[feature_index]
    for i in range(0,len(sizes)):
        if sizes[i] > 0:
            values[i] = sums[i] / sizes[i]
    print (values)
    
      #plot histogramm
    left = [] 
    for i in range(len(values)):
        left.append(i-0.5)
    fig = plt.figure(figsize=(12,8))
    plt.title(title)
    ax = fig.add_subplot(111)
    ax.bar(left,values, color = palette)            
    
def get_clustering_model(X, num_clusters):
    model = KMeans(n_clusters=num_clusters, precompute_distances = True, max_iter = 500, n_init = 30)
    model.fit(X)
    
    clusters = model.predict(X)
    plot_clusters(clusters, num_clusters)
    return clusters

def get_best_clustering_model(X, max_number_of_clusters, title):
    cost = []
    KK = range(1,max_number_of_clusters+1)
    kms = []
    # calculate all the clustering and cost
    for no_of_clusters in KK:
        km = KMeans(n_clusters=no_of_clusters, precompute_distances = True, max_iter = 500, n_init = 30)
        km.fit(X)
        kms.append(km)

        sizes = [0]*no_of_clusters
        for i in km.predict(X): 
            if(i >= no_of_clusters):
                print i
            sizes[i] = sizes[i]+1
        print (sizes)

        cost.append(km.inertia_)

    # calculate first derivative
    derivative1 = [cost[i+1]-cost[i] for i in range(len(cost)-1)]
    #print "d1", derivative1

    # calculate second derivative
    derivative2 = [derivative1[i+1]-derivative1[i] for i in range(len(derivative1)-1)]
    #print "d2", derivative2

    max2 = argrelextrema(np.argsort(derivative2), np.less) 
    num_clusters = 4 
    #print "max2", max2
    if(len(max2[0]) > 0):
        num_clusters = max2[0][0] + 3
    else:
        # calculate third derivative
        derivative3 = [derivative2[i+1]-derivative2[i] for i in range(len(derivative2)-1)]
        #print derivative3

        max3 = argrelextrema(np.argsort(derivative3), np.greater) 
        if(len(max3[0]) > 0):
            num_clusters = max3[0][0] + 4 

    model = kms[num_clusters-1]
    
    # plot costs
    plot_costs(cost, model.n_clusters, "Cost of k-Means." + title)

    clusters = model.predict(X)
    plot_clusters(clusters, model.n_clusters, title)
    return clusters, model.n_clusters, cost


import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook

from plotly.plotly import iplot
from plotly.graph_objs import Scatter3d, Data, Marker
import plotly.graph_objs as go

def plot3(feature_indexes, X, clusters, selected_clusters, title = "Cluster", cluster_titles = []):
    clusters_plot = []
    num_clusters = max(clusters)+1
    for i in range(0, num_clusters):
        if len(selected_clusters) > 0 and (i not in selected_clusters):
            continue
        d = X[clusters == i,: ]
        cluster = Scatter3d(
            x=d[:,feature_indexes[0]],
            y=d[:,feature_indexes[1]],
            z=d[:,feature_indexes[2]],
            mode='markers',
            name = cluster_titles[i] if i < len(cluster_titles) else '{}'.format(i),
            marker=dict(
                color='rgb({}, {}, {})'.format(palette[i][0]*255,palette[i][1]*255,palette[i][2]*255 ),
                size=12,
                line=dict(
                    color='rgb(204, 204, 204)',
                    width=0.0
                ),
                opacity=0.5
            )
        )
        clusters_plot.append(cluster)

    data = Data(clusters_plot)
    bk_color = "rgb(224, 224, 224)"
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0,t=60),
        title=title, 
        height = 1000,
        width = 1000,
        scene=go.Scene(
            xaxis=dict(
                title = features[feature_indexes[0]],
    showbackground=True, # (!) show axis background
    backgroundcolor=bk_color, # set background color to grey
    gridcolor="rgb(255, 255, 255)",       # set grid line color
    zerolinecolor="rgb(255, 255, 255)",   # set zero grid line color
           ),
            yaxis=dict(
                 title = features[feature_indexes[1]],
    showbackground=True, # (!) show axis background
    backgroundcolor=bk_color, # set background color to grey
    gridcolor="rgb(255, 255, 255)",       # set grid line color
    zerolinecolor="rgb(255, 255, 255)",   # set zero grid line color
            ),
            zaxis=dict(
                 title = features[feature_indexes[2]],
    showbackground=True, # (!) show axis background
    backgroundcolor=bk_color, # set background color to grey
    gridcolor="rgb(255, 255, 255)",       # set grid line color
    zerolinecolor="rgb(255, 255, 255)",   # set zero grid line color
            )
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)

def plot_intersection(clusters, num_clusters, id_incident, ips, id_incident2, cluster2 = -1):
    clusters_np = np.array(clusters)
    ips_np = np.array(ips)
    ips2 = set(tools.get_ips(id_incident2, cluster2))
    d = {}
    d["Cluster"] = []
    d["Incident"] = []
    d["data"] = []
    percentages = []
    intersections = []
    
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["Incident"].append("Unique from incident {}".format(id_incident))
        cluster_ips = set(ips_np[clusters_np == cluster])
        intersection = len(ips2.intersection(cluster_ips))
        intersections.append(intersection)
        d["data"].append(len(cluster_ips)-intersection)
        if(len(cluster_ips) == 0):
            percentages.append(0)
        else:
            percentages.append(intersection*100.0/len(cluster_ips))
        
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["Incident"].append("Intersection with incident {}".format(id_incident2))
        d["data"].append(intersections[cluster])
        
    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Incident',legend='top_right', 
          title = "Intersection. Incident {} vs. Incident {} (cluster={})".format(id_incident, id_incident2, cluster2) ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)   

def plot_countries(clusters, num_clusters, sessions, num_countries = 10):
    countries = tools.get_countries()
    ids = np.array([s['id_country'] for s in sessions])
    #first find the best countries 
    if(num_countries > len(countries)):
        num_countries = len(countries)
      
    # find the most ccountries count in total
    freq = itemfreq(ids)
    sorted_countries = sorted(freq, key=lambda k: k[1], reverse=True) 
    best_countries = sorted_countries[0:num_countries]
    codes = []
    for i in range(0,len(best_countries)):
        c = (item for item in countries if item["id"] == best_countries[i][0]).next()
        codes.append(c)

    # calculate best countries count per cluster
    clusters_np = np.array(clusters)
    d = {}
    d["Cluster"] = []
    d["Country"] = []
    d["data"] = []
    freqs= []
    for cluster in range(0, num_clusters):
        ids_cluster = ids[clusters_np == cluster]
        freqs.append(itemfreq(ids_cluster))
        
    for i in range(0,len(best_countries)):
        for cluster in range(0, num_clusters):
            d["Cluster"].append(cluster)
            d["Country"].append(codes[i]["name"])
            exists = False
            for f in freqs[cluster]:
                if (f[0] == best_countries[i][0]):
                    d["data"].append(f[1])
                    exists = True
                    break
            if (not exists) :
                d["data"].append(0)

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Country',legend='top_right', 
          title = "Countries" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p) 
    #print d
    
def plot_ban(clusters, num_clusters, sessions):
    clusters_np = np.array(clusters)
    bans = np.array([s['ban'] for s in sessions])
    d = {}
    d["Cluster"] = []
    d["Ban"] = []
    d["data"] = []
    banned = []
    percentage = []
    
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["Ban"].append("Served")
        cluster_total = bans[clusters_np == cluster]
        cluster_banned = cluster_total[cluster_total==1]
        banned.append(cluster_banned.shape[0])
        if (cluster_total.shape[0] == 0):
            p = 0
        else:
            p = float("{0:.2f}".format(cluster_banned.shape[0]*100.0/cluster_total.shape[0]))
        percentage.append(p)
        d["data"].append(cluster_total.shape[0]-cluster_banned.shape[0])
          
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["Ban"].append("Banned")
        d["data"].append(banned[cluster])

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Ban',legend='top_right', 
          title = "Banjax Ban" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)   
    print banned
    print percentage

# plot selected clusters from different incidents
"""
features = [
    "request_interval", #1
    "ua_change_rate",#2
    "html2image_ratio",#3
    "variance_request_interval",#4
    "payload_average",#5
    "error_rate",#6
    "request_depth",#7
    "request_depth_std",#8
    "session_length",#9
    "percentage_cons_requests",#10
]
"""
def plot_incidents(id_incidents, features = [
    "html2image_ratio",#3
    "variance_request_interval",#4
    "error_rate",#6
]):
    
    incidents = []
    values = []
    incident_indexes = []
    i = 1
    titles = []
    for id in id_incidents:
        print "Indicent", id, "loading..."
        sessions = tools.get_sessions(id)
        incident = tools.get_incident(id)[0]
        for s in sessions:
            if(s['cluster_index'] != incident['cluster_index']):
                continue
            row = []
            for f in features:
                row.append(s[f])
            incident_indexes.append(i-1) 
            values.append(row)
        titles.append("Incident {}, cluster {}".format(id, incident['cluster_index']))
        i = i + 1

    X = np.array(values)
    incident_indexes = np.array(incident_indexes)
    plot3([0,1,2], X, incident_indexes, -1, "Incident clusters", titles)


Loading BokehJS ...
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:78: DeprecationWarning:

JupyterApp._config_dir_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:340: DeprecationWarning:

EnableNBExtensionApp._config_file_name_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:91: DeprecationWarning:

JupyterApp._data_dir_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:73: DeprecationWarning:

JupyterApp._jupyter_path_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:69: DeprecationWarning:

JupyterApp._log_level_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:97: DeprecationWarning:

JupyterApp._runtime_dir_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/notebook/services/config/manager.py:16: DeprecationWarning:

ConfigManager._config_dir_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/notebook/services/config/manager.py:16: DeprecationWarning:

ConfigManager._config_dir_default is deprecated: use @default decorator instead.

Configuration


In [99]:
#Report Configuration
id_incident = 29

features = [
    "request_interval", #1
    "ua_change_rate",#2
    "html2image_ratio",#3
    "variance_request_interval",#4
    #"payload_average",#5
    #"error_rate",#6
    "request_depth",#7
    "request_depth_std",#8
    "session_length",#9
    #"percentage_cons_requests",#10
]

max_number_of_clusters = 15

For incident 19:

  • features: 3,4,6,8,9
  • no PCA
  • DBSCAN - eps=0.3, min_samples=10

Read & Preprocess


In [100]:
# Reading from Database
incident = None
sessions = None
incident = tools.get_incident(id_incident)
sessions = tools.get_sessions(id_incident)

#tools.disconnect_from_db()

print ("Incident {} loaded:".format(id_incident))
print ("Start: {}".format(incident[0]["start"]))
print ("Stop : {}".format(incident[0]["stop"]))
#print ("Comment : {}".format(incident[0]["comment"]))
values = []
ips = []
for s in sessions:
    row = []
    for f in features:
        row.append(s[f])
    values.append(row)
    ips.append(s["IP"])
X = np.array(values)
print (X.shape)

# normalization 
std_scale = preprocessing.StandardScaler().fit(X)
X = std_scale.transform(X)


Incident 29 loaded:
Start: 2016-02-10 21:00:00
Stop : 2016-02-11 01:00:00
(22914, 7)

Read several incidents


In [84]:
#id_incidents = [24,25,26,19,27] # Kotsubynske
#id_incidents = [19] 

id_incidents = [29,30,31,32,33,34] #Bdsmovemenet
#id_incidents = [31,32,33,34] #Bdsmovemenet, last 4 incidents
sessions = []
values = []

for id in id_incidents:
    print "Indicent", id, "loading..."
    s = tools.get_sessions(id)
    sessions = sessions + s
for s in sessions:
    row = []
    for f in features:
        row.append(s[f])
    values.append(row)

X = np.array(values)

std_scale = preprocessing.StandardScaler().fit(X)
X = std_scale.transform(X)

print (X.shape)
print "Done."


Indicent 29 loading...
Indicent 30 loading...
Indicent 31 loading...
Indicent 32 loading...
Indicent 33 loading...
Indicent 34 loading...
(65370, 7)
Done.

PCA Transform


In [130]:
# perform PCA dimensionality reduction

#PCA to 3 dimensions for visualisation
pca = sklearn.decomposition.RandomizedPCA(n_components=3).fit(X)
X = pca.transform(X)

# elbow methos fpr PCA
#clusters, num_clusters, costs_pca = get_best_clustering_model(X, max_number_of_clusters, "PCA")
#print ("Num clusters(PCA):", num_clusters)

#plot3([0,1,2], X, clusters, -1)

DBSCAN Clustering


In [104]:
# DBSCAN clustering
from sklearn.cluster import DBSCAN
from sklearn import metrics

# Compute DBSCAN
db = DBSCAN(eps=0.2, min_samples=10).fit(X)

core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
clusters = db.labels_.astype(int)
#clusters_dbscan = clusters_dbscan.tolist()
clusters = clusters + 1
# Number of clusters in labels, ignoring noise if present.
num_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)

plot_clusters(clusters, num_clusters)

print('Estimated number of clusters: %d' % num_clusters)
#plot3([0,1,2], X, clusters, -1)


[1456, 13747, 225, 271, 6255, 477, 74, 21, 20, 24, 18, 48, 72, 50, 11, 10, 20, 11, 14, 13, 15, 14, 19, 10, 10, 9]
Estimated number of clusters: 26

In [107]:
#plot3([1,0,3], X, clusters, -1)
plot3([2,0,1], X, clusters, [1,2])


K-Means Clustering


In [131]:
# K-Means
random.seed(666)
clusters, num_clusters, costs = get_best_clustering_model(X, max_number_of_clusters, "")
print ("Num clusters:", num_clusters)
#plot3([0,1,2], X, clusters, -1)


[3241]
[2939, 302]
[2936, 287, 18]
[276, 2797, 18, 150]
[2682, 264, 18, 128, 149]
[201, 263, 2608, 18, 31, 120]
[2607, 253, 8, 30, 120, 201, 22]
[1877, 253, 8, 30, 171, 768, 112, 22]
[731, 112, 8, 75, 253, 1847, 11, 22, 182]
[736, 14, 70, 4, 1848, 180, 252, 14, 11, 112]
[1760, 14, 252, 202, 48, 118, 110, 11, 4, 708, 14]
[695, 35, 4, 11, 105, 1778, 5, 112, 21, 229, 45, 201]
[743, 6, 229, 44, 619, 110, 104, 21, 35, 11, 3, 190, 1126]
[715, 232, 14, 131, 103, 11, 609, 4, 1139, 30, 13, 187, 25, 28]
[96, 710, 232, 12, 1112, 4, 25, 104, 8, 14, 124, 610, 24, 29, 137]
[731, 112, 8, 75, 253, 1847, 11, 22, 182]
('Num clusters:', 9)

In [133]:
plot3([2,0,3], X, clusters, -1)


Custom Number of Clusters


In [57]:
# custom number of clusters
num_clusters = 6
clusters = get_clustering_model(X, num_clusters)
plot3([2,0,3], X, clusters, -1)


[613, 176, 54, 2, 26, 34]

Double Clustering


In [26]:
target_cluster = 1
num_clusters = 5

X2 = X[clusters == target_cluster]
sessions2 = []
for s,cluster in zip(sessions,clusters):
    if(cluster == target_cluster):
        sessions2.append(s)

clusters2 = get_clustering_model(X2, num_clusters)


[14409, 1173, 1153, 493, 3475]

In [103]:
plot3([2,0,1], X2, clusters2, -1)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-103-fb5a40f74ee5> in <module>()
----> 1 plot3([2,0,1], X2, clusters2, -1)

<ipython-input-98-7354ade7c0fb> in plot3(feature_indexes, X, clusters, selected_clusters, title, cluster_titles)
    184     num_clusters = max(clusters)+1
    185     for i in range(0, num_clusters):
--> 186         if len(selected_clusters) > 0 and (i not in selected_clusters):
    187             continue
    188         d = X[clusters == i,: ]

TypeError: object of type 'int' has no len()

Save Clustering


In [ ]:
# saving clustering
tools.save_clustering(sessions, clusters)

if clusters2 is not None and len(clusters2)>0:
    tools.save_clustering2(sessions2, clusters2)

Save Attack


In [30]:
tools.clear_attack(d_incident)

tools.label_attack(id_incident, 1, [1], [0,2])

Cluster Feature Average


In [85]:
plot_cluster_feature(clusters, 10, X,0)


[0, 0, 0.69113964992407473, -1.222958250655855, 0.69113964992407473, 0, 0, 0, 0, 0]

In [96]:
def box_plot_feature(clusters, num_clusters, X, feature_index):
    traces = []
    for i in range(0,num_clusters):
        traces.append(go.Box(
            y = X[clusters == i][feature_index],
            boxpoints='all',
            jitter=0.5,
            name='{}'.format(i),
            pointpos=-1.8,
            
        ))
        
    data = Data(traces)
    layout = go.Layout(
        showlegend=False,
        height = 900,
        title='Feature {}'.format(features[feature_index]),
        xaxis=go.XAxis(
            showgrid=True,
            showline=True,
            ticks=''
        ),
        yaxis=go.YAxis(
            showline=True,
            ticks='',
            zeroline=True,
            #range = [0,300],
            title = "Value"
        )
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)

In [97]:
box_plot_feature(clusters, 10, X,3)


IP Intersection

IP intersection with anothe incident


In [13]:
# Intersection with another incident(and cluster)
plot_intersection(clusters, num_clusters, id_incident, ips, 34, -1)


/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead

/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:29: DeprecationWarning:

Comm._iopub_socket_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

Countries


In [37]:
#num_clusters = 14
#clusters = get_clustering_model(X, num_clusters)

plot_countries(clusters, num_clusters, sessions, 8)


/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead

/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:29: DeprecationWarning:

Comm._iopub_socket_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

Banjax


In [54]:
# banjax ban feature
plot_ban(clusters, num_clusters, sessions)


/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead

/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead

[127, 0, 581, 0, 346, 1419, 23, 0, 0, 0, 0, 11, 0, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0]
[10.24, 0.0, 23.87, 0.0, 98.58, 98.95, 36.51, 0.0, 0.0, 0.0, 0.0, 1.6, 0.0, 96.82, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 27.27, 0.0, 0.0]
/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:29: DeprecationWarning:

Comm._iopub_socket_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

User Agent


In [115]:
#ua
def plot_ua(clusters, num_clusters, sessions):
    clusters_np = np.array(clusters)
    #uas = np.array([s['ua'] == 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)' for s in sessions])
    uas = np.array([ s['ua'].find("WordPress") >= 0 if s['ua'] != None else False  for s in sessions])
    d = {}
    d["Cluster"] = []
    d["UserAgent"] = []
    d["data"] = []
    ua = []
    percentage = []
    
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["UserAgent"].append("Other")
        cluster_total = uas[clusters_np == cluster]
        cluster_ua = cluster_total[cluster_total==1]
        ua.append(cluster_ua.shape[0])
        if (cluster_total.shape[0] == 0):
            p = 0
        else:
            p = float("{0:.2f}".format(cluster_ua.shape[0]*100.0/cluster_total.shape[0]))
        percentage.append(p)
        d["data"].append(cluster_total.shape[0]-cluster_ua.shape[0])
          
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["UserAgent"].append(" Suspected User Agent")
        d["data"].append(ua[cluster])

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='UserAgent',legend='top_right', 
          title = "User agent in clusters" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)   
    print banned
    print percentage

In [116]:
plot_ua(clusters, num_clusters, sessions)


/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead

/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:29: DeprecationWarning:

Comm._iopub_socket_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-116-84d6c7a70b59> in <module>()
----> 1 plot_ua(clusters, num_clusters, sessions)

<ipython-input-115-f4ef7cf89e2b> in plot_ua(clusters, num_clusters, sessions)
     34          ylabel = "IP sessions", plot_width=1000, plot_height=600)
     35     bk.show(p)
---> 36     print banned
     37     print percentage

NameError: global name 'banned' is not defined

In [91]:
print len(sessions)


30104

Banjax total


In [45]:
# ban
banned = []
for s in sessions:
    if(s['ban'] == 1):
        banned.append(1)
    else:
        banned.append(0)
banned = np.array(banned)
#print X[banned==1,:]        

plot3([4,1,3], X, banned, -1, "Banned IPs vs Regular IPs")


Calculatig intersection


In [2]:
tools.calculate_all_intersections(19)

Cross-incident clusters


In [74]:
plot_incidents([19,27], features = [
    "request_interval", #1
    #"ua_change_rate",#2
    "html2image_ratio",#3
    "variance_request_interval",#4
    #"payload_average",#5
    "error_rate",#6
    #"request_depth",#7
    #"request_depth_std",#8
    #"session_length",#9
    #"percentage_cons_requests",#10
])


Indicent 19 loading...
Indicent 27 loading...

In [ ]: