Analytics

Bothound project

Initialization


In [1]:
# initialization
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn import preprocessing
from scipy.spatial.distance import cdist,pdist
from scipy.signal import argrelextrema
%matplotlib inline
from pylab import *
from numpy import *
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from scipy.stats import itemfreq

# bokeh
from collections import OrderedDict
import pandas as pd
import bokeh.plotting as bk
bk.output_notebook()
from bokeh.charts import Bar
from bokeh.charts import Histogram

# Enable the code-folding notebook extension. Run it once.
ext_require_path = 'usability/codefolding/main'
from notebook.nbextensions import EnableNBExtensionApp
# Use the app-level method when this `notebook` install provides it;
# otherwise fall back to the module-level helper (presumably an API
# difference between notebook versions -- TODO confirm).
if hasattr(EnableNBExtensionApp(), 'enable_nbextension'):
    EnableNBExtensionApp().enable_nbextension(ext_require_path)
else:
    from notebook.nbextensions import enable_nbextension
    enable_nbextension('notebook', ext_require_path)

import yaml
from bothound_tools import BothoundTools

# Base RGB colours (0-255 per channel). get_palette() cycles through this
# list and rescales each channel to the 0-1 range.
color_set = [
    [0, 0, 255],      #Blue
    [255, 0, 0],      #Red
    [0, 255, 0],      #Green
    [255, 255, 0],    #Yellow
    [255, 0, 255],    #Magenta
    [255, 128, 128],  #Pink
    [128, 128, 128],  #Gray
    [128, 0, 0],      #Brown
    [255, 128, 0],    #Orange
]

# Load the Bothound configuration and open the database connection.
# The context manager closes the config file handle as soon as it is parsed.
# NOTE(review): yaml.load can construct arbitrary Python objects; switch to
# yaml.safe_load if the config only contains plain YAML types.
with open("../conf/bothound.yaml", "r") as stram:
    conf = yaml.load(stram)
tools = BothoundTools(conf)
tools.connect_to_db()

def get_palette(N=5):
    """Return N RGBA colours as lists of four numbers.

    Colours come from the module-level ``color_set``; each channel is divided
    by 255.0 and an alpha of 1 is appended. Once N exceeds the number of base
    colours the sequence repeats from the start.
    """
    def to_rgba(rgb):
        # Rescale the 0-255 channels to floats and append full opacity.
        return [rgb[0]/255.0, rgb[1]/255.0, rgb[2]/255.0, 1]

    return [to_rgba(color_set[index % len(color_set)]) for index in range(N)]
palette = get_palette(80)

def plot_costs(costs, num_clusters, title):
    """Plot the clustering cost curve and circle the selected cluster count.

    Parameters
    ----------
    costs : sequence of numbers
        Cost per number of clusters; ``costs[k-1]`` is plotted at x = k.
    num_clusters : int
        Selected number of clusters (1-based); its point is circled in red.
    title : str
        Title of the figure.
    """
    # x axis: 1 .. len(costs), one position per entry of `costs`
    cluster_counts = list(range(1, len(costs)+1))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(cluster_counts, costs, 'b*-')
    # hollow red circle around the selected number of clusters
    ax.plot(num_clusters, costs[num_clusters-1], marker='o', markersize=14,
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within sum of squares')
    plt.title(title)
    
def plot_clusters(clusters, num_clusters,title="Histogram"):
    """Print the size of every cluster and draw the sizes as a bar chart.

    Parameters
    ----------
    clusters : iterable of int
        Cluster label per sample. Negative labels are ignored.
    num_clusters : int
        Number of clusters; valid labels are 0 .. num_clusters-1.
    title : str
        Title of the figure.

    A label >= num_clusters is printed and skipped instead of raising an
    IndexError while counting.
    """
    sizes = [0]*num_clusters
    for label in clusters:
        if label < 0:
            continue
        if label >= num_clusters:
            # Unexpected label: report it and leave it out of the counts.
            print(label)
            continue
        sizes[label] += 1
    print (sizes)

    # plot histogram; each bar starts half a unit left of its cluster index
    left = [index - 0.5 for index in range(len(sizes))]
    fig = plt.figure(figsize=(12,8))
    plt.title(title)
    ax = fig.add_subplot(111)
    ax.bar(left,sizes, color = palette)
    
    
def get_clustering_model(X, num_clusters, random_state=None):
    """Cluster X with k-means, plot the cluster sizes and return the labels.

    Parameters
    ----------
    X : array-like
        Samples passed to ``KMeans.fit`` / ``KMeans.predict``.
    num_clusters : int
        Number of clusters to fit.
    random_state : optional
        Forwarded to ``KMeans``. The default (None) keeps the original
        unseeded behaviour; pass an int for reproducible clusters.

    Returns
    -------
    The cluster label of every sample in X (despite the function name, the
    fitted model itself is not returned).
    """
    model = KMeans(n_clusters=num_clusters, precompute_distances = True, max_iter = 500,
                   n_init = 30, random_state = random_state)
    model.fit(X)

    clusters = model.predict(X)
    plot_clusters(clusters, num_clusters)
    return clusters

def get_best_clustering_model(X, max_number_of_clusters, title, random_state=None):
    """Fit k-means for k = 1 .. max_number_of_clusters and pick k heuristically.

    The cost (``inertia_``) of every fit is collected; the number of clusters
    is then chosen from the finite differences of that cost curve, falling
    back to 4 when no extremum is found. The cost curve and the sizes of the
    chosen clusters are plotted.

    Parameters
    ----------
    X : array-like
        Samples to cluster.
    max_number_of_clusters : int
        Largest k that is tried.
    title : str
        Appended to the cost-plot title and used as the histogram title.
    random_state : optional
        Forwarded to every ``KMeans``; None keeps the original unseeded runs.

    Returns
    -------
    (clusters, n_clusters, cost) : labels per sample for the chosen model, the
    chosen number of clusters, and the list of costs for every k tried.
    """
    cost = []
    kms = []
    # calculate all the clustering and cost
    for no_of_clusters in range(1, max_number_of_clusters+1):
        km = KMeans(n_clusters=no_of_clusters, precompute_distances = True, max_iter = 500,
                    n_init = 30, random_state = random_state)
        km.fit(X)
        kms.append(km)

        # report the cluster sizes of this fit
        sizes = [0]*no_of_clusters
        for label in km.predict(X):
            sizes[label] += 1
        print (sizes)

        cost.append(km.inertia_)

    # calculate first derivative
    derivative1 = [cost[i+1]-cost[i] for i in range(len(cost)-1)]

    # calculate second derivative
    derivative2 = [derivative1[i+1]-derivative1[i] for i in range(len(derivative1)-1)]

    # NOTE(review): the extrema are searched in the argsort permutation of the
    # derivative, not in the derivative values themselves; kept exactly as in
    # the original heuristic -- confirm this is intended.
    max2 = argrelextrema(np.argsort(derivative2), np.less)
    num_clusters = 4
    if(len(max2[0]) > 0):
        num_clusters = max2[0][0] + 3
    else:
        # calculate third derivative
        derivative3 = [derivative2[i+1]-derivative2[i] for i in range(len(derivative2)-1)]

        max3 = argrelextrema(np.argsort(derivative3), np.greater)
        if(len(max3[0]) > 0):
            num_clusters = max3[0][0] + 4

    # The fallback of 4 can exceed the number of fitted models when
    # max_number_of_clusters < 4; clamp so the lookup below cannot overflow.
    if num_clusters > len(kms):
        num_clusters = len(kms)

    model = kms[num_clusters-1]

    # plot costs
    plot_costs(cost, model.n_clusters, "Cost of k-Means." + title)

    clusters = model.predict(X)
    plot_clusters(clusters, model.n_clusters, title)
    return clusters, model.n_clusters, cost


import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook

from plotly.plotly import iplot
from plotly.graph_objs import Scatter3d, Data, Marker
import plotly.graph_objs as go

def plot3(feature_indexes, X, clusters, selected_cluster, title = "Cluster", feature_names = None):
    """Render an interactive 3-D scatter plot of clustered samples with plotly.

    Parameters
    ----------
    feature_indexes : sequence of int
        Three column indexes of X used for the x, y and z axes.
    X : 2-D array
        Samples; indexed as ``X[row_mask, :]`` (assumes a numpy array).
    clusters : sequence of int
        Cluster label per row of X. Labels 0 .. max(clusters) get one trace
        each; negative labels are not plotted.
    selected_cluster
        Not used by the implementation; kept so existing calls keep working.
    title : str
        Legend prefix for every cluster except 0, which is named "All traffic".
    feature_names : sequence of str, optional
        Axis titles, indexed by feature index. When omitted, the notebook-level
        ``features`` variable is used, as the original implementation did.
    """
    if feature_names is None:
        # Backward-compatible fallback to the implicit notebook global.
        feature_names = features

    # Accept plain lists as well: `list == i` would not produce a row mask.
    labels = np.asarray(clusters)
    num_clusters = int(labels.max()) + 1 if labels.size else 0

    clusters_plot = []
    for i in range(0, num_clusters):
        d = X[labels == i,: ]
        # Wrap around so more clusters than palette entries cannot overflow.
        rgba = palette[i % len(palette)]
        cluster = Scatter3d(
            x=d[:,feature_indexes[0]],
            y=d[:,feature_indexes[1]],
            z=d[:,feature_indexes[2]],
            mode='markers',
            name = "All traffic" if i == 0 else "{} {}".format(title, i),
            marker=dict(
                color='rgb({}, {}, {})'.format(rgba[0]*255, rgba[1]*255, rgba[2]*255),
                size=12,
                line=dict(
                    color='rgb(204, 204, 204)',
                    width=0.0
                ),
                opacity=0.2
            )
        )
        clusters_plot.append(cluster)

    bk_color = "rgb(224, 224, 224)"

    def axis_style(feature_index):
        # Shared look of the three axes; only the title differs.
        return dict(
            title = feature_names[feature_index],
            showbackground=True,                  # show axis background
            backgroundcolor=bk_color,             # grey background
            gridcolor="rgb(255, 255, 255)",       # grid line color
            zerolinecolor="rgb(255, 255, 255)",   # zero grid line color
        )

    data = Data(clusters_plot)
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0,t=60),
        title='',
        height = 1000,
        width = 1000,
        legend=dict(
            font=dict(
                family='sans-serif',
                size=16,
                color='#000'
            ),
            bgcolor='#E2E2E2',
            bordercolor='#FFFFFF',
            borderwidth=2
        ),
        scene=go.Scene(
            xaxis=axis_style(feature_indexes[0]),
            yaxis=axis_style(feature_indexes[1]),
            zaxis=axis_style(feature_indexes[2]),
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)


def plot_intersection(clusters, num_clusters, id_incident, ips, id_incident2, cluster2 = -1):
    """Stacked bar chart of IP overlap between clusters and a second incident.

    For every cluster 0 .. num_clusters-1 the distinct IPs of that cluster are
    split into those also returned by ``tools.get_ips(id_incident2, cluster2)``
    and those that are not; both counts are stacked per cluster.

    Parameters
    ----------
    clusters : sequence of int
        Cluster label per entry of `ips`.
    num_clusters : int
        Number of clusters to chart.
    id_incident : id of the incident the clusters belong to (used in labels).
    ips : sequence
        IP per sample, aligned with `clusters`.
    id_incident2 : id of the incident to intersect with.
    cluster2 : cluster selector forwarded to ``tools.get_ips``.
    """
    labels = np.array(clusters)
    all_ips = np.array(ips)
    other_ips = set(tools.get_ips(id_incident2, cluster2))

    unique_counts = []
    shared_counts = []
    for cluster in range(0, num_clusters):
        cluster_ips = set(all_ips[labels == cluster])
        shared = len(other_ips.intersection(cluster_ips))
        shared_counts.append(shared)
        unique_counts.append(len(cluster_ips)-shared)

    # Rows: first the "unique" series for all clusters, then the intersection.
    cluster_ids = list(range(0, num_clusters))
    d = {}
    d["Cluster"] = cluster_ids + cluster_ids
    d["Incident"] = (["Unique from incident {}".format(id_incident)]*num_clusters
                     + ["Intersection with incident {}".format(id_incident2)]*num_clusters)
    d["data"] = unique_counts + shared_counts

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Incident',legend='top_right',
          title = "Intersection. Incident {} vs. Incident {} (cluster={})".format(id_incident, id_incident2, cluster2) ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)

def plot_countries(clusters, num_clusters, sessions, num_countries = 10):
    """Stacked bar chart of the most frequent countries per cluster.

    Parameters
    ----------
    clusters : sequence of int
        Cluster label per session.
    num_clusters : int
        Number of clusters to chart.
    sessions : sequence of dict
        Sessions aligned with `clusters`; each provides 'id_country'.
    num_countries : int
        How many of the overall most frequent countries to show.
    """
    countries = tools.get_countries()
    ids = np.array([s['id_country'] for s in sessions])
    # never ask for more countries than are known
    if(num_countries > len(countries)):
        num_countries = len(countries)

    # rank country ids by their total number of sessions
    ranked = sorted(itemfreq(ids), key=lambda row: row[1], reverse=True)
    best_countries = ranked[0:num_countries]
    codes = []
    for best in best_countries:
        match = next(item for item in countries if item["id"] == best[0])
        codes.append(match)

    # country frequencies inside every cluster
    clusters_np = np.array(clusters)
    freqs = []
    for cluster in range(0, num_clusters):
        freqs.append(itemfreq(ids[clusters_np == cluster]))

    d = {}
    d["Cluster"] = []
    d["Country"] = []
    d["data"] = []
    for rank in range(0, len(best_countries)):
        wanted_id = best_countries[rank][0]
        for cluster in range(0, num_clusters):
            # count of this country in this cluster, 0 when absent
            count = 0
            for f in freqs[cluster]:
                if (f[0] == wanted_id):
                    count = f[1]
                    break
            d["Cluster"].append(cluster)
            d["Country"].append(codes[rank]["name"])
            d["data"].append(count)

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Country',legend='top_right',
          title = "Countries" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)
    
def plot_ban(clusters, num_clusters, sessions):
    """Stacked bar chart of served vs. banned sessions per cluster.

    A session counts as banned when its 'ban' value equals 1. After showing
    the chart, the banned counts and the banned percentages (two decimals)
    per cluster are printed.

    Parameters
    ----------
    clusters : sequence of int
        Cluster label per session.
    num_clusters : int
        Number of clusters to chart.
    sessions : sequence of dict
        Sessions aligned with `clusters`; each provides 'ban'.
    """
    labels = np.array(clusters)
    bans = np.array([s['ban'] for s in sessions])

    served = []
    banned = []
    percentage = []
    for cluster in range(0, num_clusters):
        cluster_total = bans[labels == cluster]
        total_count = cluster_total.shape[0]
        banned_count = cluster_total[cluster_total==1].shape[0]
        banned.append(banned_count)
        if (total_count == 0):
            percentage.append(0)
        else:
            percentage.append(float("{0:.2f}".format(banned_count*100.0/total_count)))
        served.append(total_count-banned_count)

    # Rows: the "Served" series for all clusters, followed by "Banned".
    cluster_ids = list(range(0, num_clusters))
    d = {}
    d["Cluster"] = cluster_ids + cluster_ids
    d["Ban"] = ["Served"]*num_clusters + ["Banned"]*num_clusters
    d["data"] = served + banned

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Ban',legend='top_right',
          title = "Banjax Ban" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)
    print(banned)
    print(percentage)
    

def get_countries(incidents, num_countries = 25):
    """Return the most frequent countries among attack sessions.

    Only sessions whose 'attack' value is greater than 0 are counted, across
    all given incidents.

    Parameters
    ----------
    incidents : sequence of dict
        Each provides 'sessions', a list of dicts with 'attack' and
        'id_country'.
    num_countries : int
        Maximum number of countries to return.

    Returns
    -------
    list of dict with keys "count", "name" and "id", most frequent first.
    """
    countries = tools.get_countries()
    ids = np.array([
        s['id_country']
        for incident in incidents
        for s in incident['sessions']
        if(s['attack'] > 0)
    ])
    # never ask for more countries than are known
    if(num_countries > len(countries)):
        num_countries = len(countries)

    # rank country ids by their total number of sessions
    ranked = sorted(itemfreq(ids), key=lambda row: row[1], reverse=True)
    codes = []
    for best in ranked[0:num_countries]:
        c = next(item for item in countries if item["id"] == best[0])
        codes.append({ "count": best[1], "name": c['name'], "id": c["id"]})
    return codes

def get_countries_count(id_incidents, attack):
    """Count distinct attacking IPs per country for the given incidents.

    Parameters
    ----------
    id_incidents : iterable of int
        Incident ids to include. An empty iterable yields an empty result
        instead of an invalid SQL statement.
    attack : int
        When greater than 0 only sessions with exactly this attack id are
        counted; otherwise every session with ``attack > 0`` is counted.

    Returns
    -------
    list of [id_country, count] pairs, sorted by count, largest first.
    """
    # Coerce to int so that only numbers are ever interpolated into the SQL.
    incident_ids = [str(int(id_incident)) for id_incident in id_incidents]
    if not incident_ids:
        # "in ()" is not valid SQL; nothing selected means nothing to count.
        return []
    sql_where = "where id_incident in ({})".format(",".join(incident_ids))

    if(attack > 0):
        sql_where += " and attack = {}".format(int(attack))
    else:
        sql_where += " and attack > 0"

    tools.cur.execute("select distinctrow IP, id_country from sessions " + sql_where)
    rows = tools.cur.fetchall()

    # one row per distinct (IP, country) pair -> number of IPs per country
    counts = {}
    for row in rows:
        id_country = row["id_country"]
        counts[id_country] = counts.get(id_country, 0) + 1

    res = [[key, value] for key, value in counts.items()]
    return sorted(res, key=lambda x: x[1], reverse=True)
    
    
def plot_attack_countries(id_incidents, num_countries = 10):
    """Stacked bar chart of attacking IPs per country, one bar per attack.

    The chart shows the union of the overall top `num_countries` countries
    and every attack's own top `num_countries` countries. For each attack
    only its own top `num_countries` countries are drawn individually; all
    of its remaining countries are summed into the "Other" segment.

    Parameters
    ----------
    id_incidents : iterable of int
        Incident ids to analyse.
    num_countries : int
        Number of top countries taken overall and per attack.
    """
    # a negative value would turn the slices below into "all but the last n"
    top_n = num_countries if num_countries > 0 else 0

    attacks = tools.get_attacks(id_incidents)

    names = {}
    for c in tools.get_countries():
        names[c["id"]] = c["name"]

    all_counts = get_countries_count(id_incidents, -1)

    # start with the overall top countries ...
    best_countries = [row[0] for row in all_counts[:top_n]]

    # ... and add every attack's own top countries
    freqs= []
    for attack in attacks:
        cur_countries = get_countries_count(id_incidents, attack["id"])
        for row in cur_countries[:top_n]:
            if (row[0] not in best_countries):
                best_countries.append(row[0])
        freqs.append(cur_countries)

    d = {}
    d["Attack"] = []
    d["Country"] = []
    d["data"] = []

    for id_country in best_countries:
        for index_attack in range(0, len(attacks)):
            # Look only at this attack's top entries; slicing (instead of
            # indexing 0..num_countries-1) copes with attacks that have
            # fewer countries than requested.
            count = 0
            for f in freqs[index_attack][:top_n]:
                if (f[0] == id_country):
                    count = f[1]
                    break
            d["Attack"].append(attacks[index_attack]["id"])
            d["Country"].append(names[id_country])
            d["data"].append(count)

    for index_attack in range(0, len(attacks)):
        # "Other" = everything below this attack's top entries, so that the
        # stacked bar adds up to the attack's total number of IPs.
        other = 0
        for f in freqs[index_attack][top_n:]:
            other += f[1]
        d["Attack"].append(attacks[index_attack]["id"])
        d["Country"].append("Other")
        d["data"].append(other)

    df=pd.DataFrame(d)
    p=Bar(df,label='Attack',values='data',stack='Country',legend='top_center',
          title = "Countries" ,
         ylabel = "#IP", plot_width=1000, plot_height=600)
    bk.show(p)
            
def plot_incident_countries(incidents, num_countries = 10):
    """Stacked bar chart of the most frequent attack countries per incident.

    Only sessions whose 'attack' value is greater than 0 are counted.

    Parameters
    ----------
    incidents : sequence of dict
        Each provides 'id' and 'sessions' (dicts with 'attack' and
        'id_country').
    num_countries : int
        How many of the overall most frequent countries to show.
    """
    countries = tools.get_countries()

    # country ids of the attack sessions, one list per incident
    ids_per_incident = []
    for incident in incidents:
        ids_per_incident.append(
            [s['id_country'] for s in incident['sessions'] if(s['attack'] > 0)])
    ids = np.array([id_country
                    for ids_incident in ids_per_incident
                    for id_country in ids_incident])

    # never ask for more countries than are known
    if(num_countries > len(countries)):
        num_countries = len(countries)

    # rank country ids by their total number of attack sessions
    ranked = sorted(itemfreq(ids), key=lambda row: row[1], reverse=True)
    best_countries = ranked[0:num_countries]
    codes = []
    for best in best_countries:
        match = next(item for item in countries if item["id"] == best[0])
        codes.append(match)

    # country frequencies inside every incident
    freqs = [itemfreq(ids_incident) for ids_incident in ids_per_incident]

    d = {}
    d["Incident"] = []
    d["Country"] = []
    d["data"] = []
    for rank in range(0, len(best_countries)):
        wanted_id = best_countries[rank][0]
        for index_incident in range(0, len(incidents)):
            # count of this country in this incident, 0 when absent
            count = 0
            for f in freqs[index_incident]:
                if (f[0] == wanted_id):
                    count = f[1]
                    break
            d["Incident"].append(incidents[index_incident]["id"])
            d["Country"].append(codes[rank]["name"])
            d["data"].append(count)

    df=pd.DataFrame(d)
    p=Bar(df,label='Incident',values='data',stack='Country',legend='top_right',
          title = "Countries" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)

def bar_plot(data, x_label, y_label, title):
    """Show a stacked bar chart built from a list of labelled value mappings.

    Parameters
    ----------
    data : iterable of dict
        Each item provides "x" (the bar it belongs to) and "values", a
        mapping from legend key to number. Keys missing from an item count
        as 0.
    x_label : str
        Name of the bar (x) column; also used as the chart's label column.
    y_label : str
        Label of the y axis.
    title : str
        Title of the chart.
    """
    # `data` is traversed twice; materialise it so generators work too.
    data = list(data)

    # Legend keys in first-seen order, which keeps the stacking order stable
    # between runs.
    keys = []
    seen = set()
    for item in data:
        for key in item["values"]:
            if key not in seen:
                seen.add(key)
                keys.append(key)

    d = {}
    d[x_label] = []
    d["legend"] = []
    d["data"] = []

    for item in data:
        for key in keys:
            d[x_label].append(item["x"])
            d["legend"].append(key)
            d["data"].append(item["values"][key] if key in item["values"] else 0)
    df=pd.DataFrame(d)
    # The label column is the one created above, whatever x_label is called.
    p=Bar(df,label=x_label,values='data',stack="legend",legend='top_right',
        title = title,
        ylabel = y_label, plot_width=1000, plot_height=1000)
    bk.show(p)


/usr/local/lib/python2.7/dist-packages/ipykernel/pylab/config.py:66: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  inline backend."""
/usr/local/lib/python2.7/dist-packages/ipykernel/pylab/config.py:71: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  'retina', 'jpeg', 'svg', 'pdf'.""")
/usr/local/lib/python2.7/dist-packages/ipykernel/pylab/config.py:85: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  use `figure_formats` instead)""")
/usr/local/lib/python2.7/dist-packages/ipykernel/pylab/config.py:95: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  """
/usr/local/lib/python2.7/dist-packages/ipykernel/pylab/config.py:114: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  """)
/usr/local/lib/python2.7/dist-packages/ipykernel/pylab/config.py:44: DeprecationWarning: InlineBackend._config_changed is deprecated: use @observe and @unobserve instead.
  def _config_changed(self, name, old, new):
/usr/local/lib/python2.7/dist-packages/traitlets/traitlets.py:770: DeprecationWarning: A parent of InlineBackend._config_changed has adopted the new @observe(change) API
  clsname, change_or_name), DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:98: DeprecationWarning: DisplayFormatter._formatters_default is deprecated: use @default decorator instead.
  def _formatters_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:677: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.
  def _deferred_printers_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:669: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.
  def _singleton_printers_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:672: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.
  def _type_printers_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:672: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.
  def _type_printers_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:677: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.
  def _deferred_printers_default(self):
Loading BokehJS ...
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:106: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  help="""Generate default config file."""
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:110: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  help="Specify a config file to load."
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:118: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  help="""Full path of a config file.""",
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:122: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  help="""Answer yes to any prompts."""
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:286: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  overwrite = Bool(False, config=True, help="Force overwrite of existing files")
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:287: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  symlink = Bool(False, config=True, help="Create symlinks instead of copying files")
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:288: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  user = Bool(False, config=True, help="Whether to do a user install")
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:289: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  prefix = Unicode('', config=True, help="Installation prefix")
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:290: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  nbextensions_dir = Unicode('', config=True, help="Full path to nbextensions dir (probably use prefix or user)")
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:291: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  destination = Unicode('', config=True, help="Destination for the copy or symlink")
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:293: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  help="Verbosity level"
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:333: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  help=("Which config section to add the extension to. "
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:364: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  help=("Which config section to remove the extension from. "
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:78: DeprecationWarning: JupyterApp._config_dir_default is deprecated: use @default decorator instead.
  def _config_dir_default(self):
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:340: DeprecationWarning: EnableNBExtensionApp._config_file_name_default is deprecated: use @default decorator instead.
  def _config_file_name_default(self):
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:91: DeprecationWarning: JupyterApp._data_dir_default is deprecated: use @default decorator instead.
  def _data_dir_default(self):
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:73: DeprecationWarning: JupyterApp._jupyter_path_default is deprecated: use @default decorator instead.
  def _jupyter_path_default(self):
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:69: DeprecationWarning: JupyterApp._log_level_default is deprecated: use @default decorator instead.
  def _log_level_default(self):
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:97: DeprecationWarning: JupyterApp._runtime_dir_default is deprecated: use @default decorator instead.
  def _runtime_dir_default(self):
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:78: DeprecationWarning: JupyterApp._config_dir_default is deprecated: use @default decorator instead.
  def _config_dir_default(self):
/usr/local/lib/python2.7/dist-packages/notebook/nbextensions.py:340: DeprecationWarning: EnableNBExtensionApp._config_file_name_default is deprecated: use @default decorator instead.
  def _config_file_name_default(self):
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:91: DeprecationWarning: JupyterApp._data_dir_default is deprecated: use @default decorator instead.
  def _data_dir_default(self):
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:73: DeprecationWarning: JupyterApp._jupyter_path_default is deprecated: use @default decorator instead.
  def _jupyter_path_default(self):
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:69: DeprecationWarning: JupyterApp._log_level_default is deprecated: use @default decorator instead.
  def _log_level_default(self):
/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py:97: DeprecationWarning: JupyterApp._runtime_dir_default is deprecated: use @default decorator instead.
  def _runtime_dir_default(self):
/usr/local/lib/python2.7/dist-packages/notebook/services/config/manager.py:15: DeprecationWarning: metadata {'config': True} was set from the constructor.  Metadata should be set using the .tag() method, e.g., Int().tag(key1='value1', key2='value2')
  config_dir = Unicode(config=True)
/usr/local/lib/python2.7/dist-packages/notebook/services/config/manager.py:16: DeprecationWarning: ConfigManager._config_dir_default is deprecated: use @default decorator instead.
  def _config_dir_default(self):
/usr/local/lib/python2.7/dist-packages/notebook/services/config/manager.py:16: DeprecationWarning: ConfigManager._config_dir_default is deprecated: use @default decorator instead.
  def _config_dir_default(self):
bothound_tools.py:37: Warning: Can't create database 'bothound'; database exists
  self.cur.execute(sql)
bothound_tools.py:46: Warning: Table 'attacks' already exists
  self.cur.execute("create table IF NOT EXISTS attacks (id INT NOT NULL AUTO_INCREMENT, "
bothound_tools.py:51: Warning: Table 'incidents' already exists
  self.cur.execute("create table IF NOT EXISTS incidents (id INT NOT NULL AUTO_INCREMENT, "
bothound_tools.py:67: Warning: Table 'sessions' already exists
  self.cur.execute("create table IF NOT EXISTS sessions (id INT NOT NULL AUTO_INCREMENT, "
bothound_tools.py:96: Warning: Table 'clusters' already exists
  self.cur.execute("create table IF NOT EXISTS clusters (id INT NOT NULL AUTO_INCREMENT, "
bothound_tools.py:104: Warning: Table 'deflectees' already exists
  self.cur.execute("create table IF NOT EXISTS deflectees (id INT NOT NULL AUTO_INCREMENT, "
bothound_tools.py:110: Warning: Table 'countries' already exists
  self.cur.execute("create table IF NOT EXISTS countries (id INT NOT NULL AUTO_INCREMENT, "
bothound_tools.py:116: Warning: Table 'intersections' already exists
  self.cur.execute("create table IF NOT EXISTS intersections (id INT NOT NULL AUTO_INCREMENT, "
bothound_tools.py:127: Warning: Table 'user_agents' already exists
  self.cur.execute("create table IF NOT EXISTS user_agents (id INT NOT NULL AUTO_INCREMENT, "
bothound_tools.py:142: Warning: Table 'session_user_agent' already exists
  self.cur.execute("create table IF NOT EXISTS session_user_agent (id INT NOT NULL AUTO_INCREMENT, "
bothound_tools.py:153: Warning: Table 'encryption' already exists
  self.cur.execute("create table IF NOT EXISTS encryption (id INT NOT NULL AUTO_INCREMENT, "
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:92: DeprecationWarning:

DisplayFormatter._ipython_display_formatter_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:669: DeprecationWarning:

PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.

Configuration


In [2]:
# Choose incidents to explore: these ids are used below to load sessions and
# incident records from the database and are passed to the plotting helpers.
id_incidents = [50,51,52,53,54]

Read Data


In [3]:
# Reading from Database: one dict per incident holding its id, its sessions
# and its incident record (first row returned by tools.get_incident).
incidents = []
for id_incident in id_incidents:  # avoid shadowing the builtin `id`
    print("Indicent {} loading...".format(id_incident))
    incident = {
        "id": id_incident,
        "sessions": tools.get_sessions(id_incident),
        "incident": tools.get_incident(id_incident)[0],
    }
    incidents.append(incident)
    print("total sessions {}".format(len(incident['sessions'])))
print("Done.")


Indicent 50 loading...
total sessions 5571
Indicent 51 loading...
total sessions 21763
Indicent 52 loading...
total sessions 2717
Indicent 53 loading...
total sessions 5571
Indicent 54 loading...
total sessions 86983
Done.

Attacks Summary


In [4]:
# Per-incident summary, followed by the number of IPs recorded per attack.
tools.incidents_summary(id_incidents)
attacks = tools.get_attacks(id_incidents) # show attack count
for a in attacks:
    attack_line = "Attack {} = {} ips".format(a["id"], a["count"] )
    print(attack_line)


Incident 50, num IPs = 5571, num Bots = 5324
Incident 51, num IPs = 17222, num Bots = 12224
Incident 52, num IPs = 2430, num Bots = 1581
Incident 53, num IPs = 5571, num Bots = 4953
Incident 54, num IPs = 78743, num Bots = 55286
Attack 50 = 5324 ips
Attack 51 = 12224 ips
Attack 52 = 1581 ips
Attack 53 = 4953 ips
Attack 54 = 55286 ips

Countries by attack


In [40]:
# Stacked bars of attacking IPs per country for every attack of the chosen
# incidents, based on the 5 most frequent countries.
plot_attack_countries(id_incidents, num_countries = 5)


/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead

/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:29: DeprecationWarning:

Comm._iopub_socket_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

Countries by Incident


In [41]:
# Country chart per incident (num_countries set to 5); get_countries() stays
# the last expression so its return value is displayed as the cell output.
plot_incident_countries(incidents, num_countries=5)
get_countries(incidents)


/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead

/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:29: DeprecationWarning:

Comm._iopub_socket_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

Out[41]:
[{'count': 64301, 'id': 2L, 'name': 'United States'},
 {'count': 3540, 'id': 42L, 'name': 'Canada'},
 {'count': 3023, 'id': 15L, 'name': 'United Kingdom'},
 {'count': 1482, 'id': 8L, 'name': 'Germany'},
 {'count': 992, 'id': 20L, 'name': 'France'},
 {'count': 940, 'id': 5L, 'name': 'Netherlands'},
 {'count': 763, 'id': 36L, 'name': 'Japan'},
 {'count': 660, 'id': 60L, 'name': 'Australia'},
 {'count': 652, 'id': 24L, 'name': 'Sweden'},
 {'count': 606, 'id': 53L, 'name': 'China'},
 {'count': 488, 'id': 16L, 'name': 'Brazil'},
 {'count': 464, 'id': 88L, 'name': 'Singapore'},
 {'count': 435, 'id': 33L, 'name': 'Ireland'},
 {'count': 392, 'id': 4L, 'name': 'Italy'},
 {'count': 391, 'id': 11L, 'name': 'Russian Federation'},
 {'count': 327, 'id': 18L, 'name': 'Poland'},
 {'count': 324, 'id': 29L, 'name': 'Spain'},
 {'count': 259, 'id': 9L, 'name': 'Romania'},
 {'count': 243, 'id': 58L, 'name': None},
 {'count': 228, 'id': 51L, 'name': 'Norway'},
 {'count': 204, 'id': 10L, 'name': 'India'},
 {'count': 204, 'id': 108L, 'name': 'South Africa'},
 {'count': 191, 'id': 34L, 'name': 'Mexico'},
 {'count': 179, 'id': 55L, 'name': 'Denmark'},
 {'count': 169, 'id': 21L, 'name': 'Indonesia'}]

User Agents


In [42]:
def plot_user_agents(id_incidents, top_n=10):
    """Bar-plot the most frequent user agents of attack sessions, per incident.

    For each incident id, sessions with ``sessions.attack > 0`` are joined to
    their user agents, grouped by user-agent string and ordered by descending
    count; at most ``top_n`` user agents are kept per incident.

    Args:
        id_incidents: iterable of incident ids to query.
        top_n: maximum number of user agents kept per incident. Defaults to
            10, the limit that used to be hard-coded.

    Returns:
        None. The collected data is handed to ``bar_plot``.
    """
    # Values are bound by the driver instead of being formatted into the SQL
    # text, and the row limit is applied by the database rather than by
    # fetching everything and discarding the tail.
    # NOTE(review): assumes tools.cur uses the "%s" paramstyle (MySQLdb /
    # PyMySQL) - confirm against BothoundTools.
    # user_agents.device_family used to be selected but was never read and is
    # not part of the GROUP BY (rejected under MySQL's ONLY_FULL_GROUP_BY),
    # so it is no longer selected.
    sql = (
        "select "
        "count(user_agents.ua) as ua_count, "
        "user_agents.ua "
        "from sessions, session_user_agent, user_agents "
        "where sessions.id = session_user_agent.id_session "
        "and user_agents.id = session_user_agent.id_user_agent "
        "and sessions.id_incident = %s "
        "and sessions.attack > 0 "
        "group by user_agents.ua "
        "order by ua_count desc "
        "limit %s"
    )
    top_n = int(top_n)

    data = []
    for incident_id in id_incidents:
        tools.cur.execute(sql, (incident_id, top_n))
        counts = {}
        for row in tools.cur.fetchall():
            counts[row["ua"]] = row["ua_count"]
        # An incident without any matching row still gets a bar entry, using
        # the same zero-valued placeholder as before.
        data.append({"x": incident_id, "values": counts if counts else {"1": 0}})

    bar_plot(data, "Incident", "UA portion", "User Agents distribution")

In [22]:
# Draw the user-agent distribution of attack sessions for the selected incidents.
plot_user_agents(id_incidents)


/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead

/usr/local/lib/python2.7/dist-packages/bokeh/core/properties.py:453: DeprecationWarning:

Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:29: DeprecationWarning:

Comm._iopub_socket_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:24: DeprecationWarning:

Comm._kernel_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:52: DeprecationWarning:

Comm._comm_id_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:32: DeprecationWarning:

Comm._session_default is deprecated: use @default decorator instead.

/usr/local/lib/python2.7/dist-packages/ipykernel/comm/comm.py:41: DeprecationWarning:

Comm._topic_default is deprecated: use @default decorator instead.

Hit rate


In [6]:
# Box plots of the sessions' request_interval, split into sessions flagged as
# attack (attack > 0) and all remaining sessions.
# NOTE(review): the y axis is titled "Hit rate/minute" while the plotted value
# is the raw request_interval field - confirm the unit.
hit_rate = []     # request_interval of the remaining sessions
hit_rate_ua = []  # request_interval of sessions with attack > 0
for incident in incidents:
    for session in incident['sessions']:
        bucket = hit_rate_ua if session['attack'] > 0 else hit_rate
        bucket.append(session['request_interval'])

# Rendering options shared by both box traces.
box_style = dict(boxpoints='all', jitter=0.5, pointpos=-1.8)
trace_other = go.Box(y=hit_rate, name='Others', **box_style)
trace_ua = go.Box(y=hit_rate_ua, name='Bots IPs', **box_style)

data = Data([trace_other, trace_ua])

x_axis = go.XAxis(showgrid=True, showline=True, ticks='')
y_axis = go.YAxis(
    showline=True,
    ticks='',
    zeroline=True,
    range=[0, 300],
    title="Hit rate/minute"
)
layout = go.Layout(
    showlegend=False,
    height=900,
    title='Hit rate of bots',
    xaxis=x_axis,
    yaxis=y_axis
)

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)


Attacks Scatter Plot


In [44]:
# Total attack: build one feature matrix from all sessions of all incidents.
features = [
    "request_interval", #1
    "ua_change_rate",#2
    "html2image_ratio",#3
    "variance_request_interval",#4
    "payload_average",#5
    "error_rate",#6
    "request_depth",#7
    "request_depth_std",#8
    "session_length",#9
    "percentage_cons_requests",#10
]

# One row per session: the values of `features` in list order, followed by the
# session's `attack` value as the last column.
values = [
    [session[name] for name in features] + [session['attack']]
    for incident in incidents
    for session in incident['sessions']
]

# Binary label per row: 0 when the session's attack value is 0, otherwise 1.
# (The former counter `i` was incremented per incident but never read, so it
# has been removed.)
incident_indexes = np.array([0 if row[-1] == 0 else 1 for row in values])

X = np.array(values)
X.shape


Out[44]:
(122605, 11)

In [45]:
# Scatter plot of the session matrix X together with the 0/1 attack labels.
# NOTE(review): [3, 2, 5] presumably selects three feature columns and -1 some
# "all"/"none" option - confirm against plot3(), defined elsewhere.
plot3([3, 2, 5], X, incident_indexes, -1, "Attack ")


Attack metrics


In [46]:
# Print the per-botnet metrics for the selected incidents (the recorded output
# lists session length, html/image ratio, payload average and hit rate).
tools.calculate_attack_metrics(id_incidents)


__________ Botnet 50:
Session length = 225.808414726 sec
Html/image ratio = 0.0655136530299
Payload average = 72014.0828325
Hit rate = 5.72711129395 /minute

__________ Botnet 51:
Session length = 2260.0294895 sec
Html/image ratio = 4.4372941316
Payload average = 27726.7115037
Hit rate = 0.127094969278 /minute

__________ Botnet 52:
Session length = 1200.23036952 sec
Html/image ratio = 1.16912557786
Payload average = 4944.41512702
Hit rate = 1.75466973781 /minute

__________ Botnet 53:
Session length = 1667.11588936 sec
Html/image ratio = 0.105296293073
Payload average = 47019.6511205
Hit rate = 7.14966154761 /minute

__________ Botnet 54:
Session length = 155.662892443 sec
Html/image ratio = 6.23702323265
Payload average = 35674.5233139
Hit rate = 5.40097611411 /minute

Attack similarity


In [48]:
# Attack similarities: distances between one target attack and the attacks of
# the incidents selected in the configuration cell.
tools.calculate_distances(
    id_incident=50,  # incident to explore
    id_attack=50,  # attack to explore
    # Incidents to compare with. Reuses the notebook-wide selection instead of
    # repeating the literal list, and passes a copy so the callee cannot alter
    # the configuration list.
    id_incidents=list(id_incidents),
    features=[]  # specify the features. Use all features if empty
)


#######################  Distance calculator
Target indicent =  50
Target attack =  50
Target cluster index  1 =  -1
Target cluster index  2 =  -1
Incidents =  [50, 51, 52, 53, 54]
Features =  ['request_interval', 'ua_change_rate', 'html2image_ratio', 'variance_request_interval', 'payload_average', 'error_rate', 'request_depth', 'request_depth_std', 'session_length', 'percentage_cons_requests']
{'distance': 52051072270126.281, 'incident': 54, 'attack': 54}
{'distance': 95.238524294124119, 'incident': 53, 'attack': 53}
{'distance': 64.708204995355572, 'incident': 51, 'attack': 51}
{'distance': 18.185247093404168, 'incident': 52, 'attack': 52}
{'distance': 0.0, 'incident': 50, 'attack': 50}

Common IPs


In [54]:
# Common IPs between the explored incident(s) and the incidents selected in
# the configuration cell.
tools.calculate_common_ips(
    incidents1=[50],  # incidents to explore
    id_attack=-1,  # attack to explore(use -1 for all attacks)
    # Incidents to compare with. Reuses the notebook-wide selection instead of
    # repeating the literal list, and passes a copy so the callee cannot alter
    # the configuration list.
    incidents2=list(id_incidents)
)


Intersection with incidents:
[50, 51, 52, 53, 54]

========================== Attack 50:
Num IPs in the attack 5324:

__________ Incident 50:
Num IPs in the incident 5324:
# identical   IPs: 5324
% of attack   IPs: 100.00%
% of incident IPs: 100.00%

__________ Incident 53:
Num IPs in the incident 4953:
# identical   IPs: 2878
% of attack   IPs: 54.06%
% of incident IPs: 58.11%

__________ Incident 51:
Num IPs in the incident 12224:
# identical   IPs: 226
% of attack   IPs: 4.24%
% of incident IPs: 1.85%

__________ Incident 52:
Num IPs in the incident 1581:
# identical   IPs: 3
% of attack   IPs: 0.06%
% of incident IPs: 0.19%

__________ Incident 54:
Num IPs in the incident 55286:
# identical   IPs: 4
% of attack   IPs: 0.08%
% of incident IPs: 0.01%

In [ ]:


In [ ]: