KDD Cup 1999

http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

Feature list

Table 1: Basic features of individual TCP connections.

feature name	description	type
duration	length (number of seconds) of the connection	continuous
protocol_type	type of the protocol, e.g. tcp, udp, etc.	discrete
service	network service on the destination, e.g., http, telnet, etc.	discrete
src_bytes	number of data bytes from source to destination	continuous
dst_bytes	number of data bytes from destination to source	continuous
flag	normal or error status of the connection	discrete
land	1 if connection is from/to the same host/port; 0 otherwise	discrete
wrong_fragment	number of ''wrong'' fragments	continuous
urgent	number of urgent packets	continuous

Table 2: Content features within a connection suggested by domain knowledge.

feature name	description	type
hot	number of ''hot'' indicators	continuous
num_failed_logins	number of failed login attempts	continuous
logged_in	1 if successfully logged in; 0 otherwise	discrete
num_compromised	number of ''compromised'' conditions	continuous
root_shell	1 if root shell is obtained; 0 otherwise	discrete
su_attempted	1 if ''su root'' command attempted; 0 otherwise	discrete
num_root	number of ''root'' accesses	continuous
num_file_creations	number of file creation operations	continuous
num_shells	number of shell prompts	continuous
num_access_files	number of operations on access control files	continuous
num_outbound_cmds	number of outbound commands in an ftp session	continuous
is_hot_login	1 if the login belongs to the ''hot'' list; 0 otherwise	discrete
is_guest_login	1 if the login is a ''guest''login; 0 otherwise	discrete



In [1]:

    
%matplotlib inline
#%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import warnings
import constants
import utils

warnings.filterwarnings('ignore')
np.random.seed(42)



In [2]:

    
data_10_percent = 'kddcup.data_10_percent'
data_full = 'kddcup.data'
data = pd.read_csv(data_10_percent, names=constants.names)

# Remove Traffic features computed using a two-second time window
data.drop(constants.traffic_features, inplace=True, axis=1)

# Categorical features to numeric labels

from sklearn import preprocessing

le_dicts = {}

for categorical_name in constants.categorical_names:
    le = preprocessing.LabelEncoder()
    le.fit(data[categorical_name])
    le_dicts[categorical_name] = dict(zip(le.transform(le.classes_), le.classes_))
#     print(categorical_name, ':', le_dicts[categorical_name])
    data[categorical_name + '_num'] = le.fit_transform(data[categorical_name])

data['label_binary_num'] = data.label.apply(lambda label: 1 if label == 'normal.' else -1)
data['label_four'] = data.label.apply(lambda label: constants.label_to_four_attack_class[label])
data['label_four_num'] = data.label_four.apply(lambda label: constants.five_classes_to_num[label])
data.drop(constants.categorical_names + ['label', 'label_four'], axis=1, inplace=True)
data.shape









    Out[2]:





(494021, 24)

Drop all DDOS attack data



In [3]:

    
data = data[data['label_four_num'] != 4]
data.shape









    Out[3]:





(102563, 24)



In [4]:

    
pd.value_counts(data['label_four_num'], sort=True)









    Out[4]:





0    97278
1     4107
2     1126
3       52
Name: label_four_num, dtype: int64



In [5]:

    
# Normalize data
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
data_scaled = data.copy(deep=True)
data_scaled[constants.names_to_normalize] = min_max_scaler.fit_transform(data[constants.names_to_normalize])



In [6]:

    
#test-train division
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(data, test_size=0.1, random_state=42)

Y_train_bin = X_train.label_binary_num
Y_train_many_classes = X_train.label_four_num

Y_test_bin = X_test.label_binary_num
Y_test_many_classes = X_test.label_four_num

X_train.drop(['label_binary_num', 'label_four_num'], axis=1, inplace=True)
X_test.drop(['label_binary_num', 'label_four_num'], axis=1, inplace=True)



In [7]:

    
#test-train division scaled  + without features without changes
data_scaled.drop(['label_binary_num', 'label_four_num'] +\
                 constants.names_without_changes,
                 axis=1)

X_train_scaled, X_test_scaled = train_test_split(data_scaled,
                                                 test_size=0.1,
                                                 random_state=42)



In [8]:

    
# #KMeans for not scaled data 
# from sklearn.cluster import KMeans
# from sklearn.metrics import confusion_matrix, accuracy_score

# kmeans = KMeans(n_clusters=2,
#                 max_iter=500,
#                 random_state=42)

# utils.model_validation(kmeans, X_train, X_test, Y_train_bin, Y_test_bin)



In [9]:

    
# One class SVM
# Not scaled data

from sklearn.svm import OneClassSVM

model = OneClassSVM(nu=0.1,
                    gamma=0.00005,
                    kernel='rbf')

utils.model_validation(model,
                       X_train,
                       X_test,
                       Y_train_bin,
                       Y_test_bin,
                       True)









    



Train: 
[[  281  4457]
 [ 9508 78060]]
accuracy:  0.848709726345
f1 score:  0.917893994179
Test: 
[[  33  514]
 [1039 8671]]
accuracy:  0.848591206006
f1 score:  0.917808944165



In [10]:

    
# One class SVM
# scaled data

from sklearn.svm import OneClassSVM

model = OneClassSVM(nu=0.1,
                    gamma=0.00005,
                    kernel='rbf')

utils.model_validation(model,
                       X_train_scaled,
                       X_test_scaled,
                       Y_train_bin,
                       Y_test_bin,
                       True)









    



Train: 
[[  125  4613]
 [ 1265 86303]]
accuracy:  0.936320499209
f1 score:  0.96706707604
Test: 
[[  16  531]
 [ 131 9579]]
accuracy:  0.935458711124
f1 score:  0.966599394551