Feature list

Table 1: Basic features of individual TCP connections.

feature name description type
duration length (number of seconds) of the connection continuous
protocol_type type of the protocol, e.g. tcp, udp, etc. discrete
service network service on the destination, e.g., http, telnet, etc. discrete
src_bytes number of data bytes from source to destination continuous
dst_bytes number of data bytes from destination to source continuous
flag normal or error status of the connection discrete
land 1 if connection is from/to the same host/port; 0 otherwise discrete
wrong_fragment number of ''wrong'' fragments continuous
urgent number of urgent packets continuous

Table 2: Content features within a connection suggested by domain knowledge.

feature name description type
hot number of ''hot'' indicators continuous
num_failed_logins number of failed login attempts continuous
logged_in 1 if successfully logged in; 0 otherwise discrete
num_compromised number of ''compromised'' conditions continuous
root_shell 1 if root shell is obtained; 0 otherwise discrete
su_attempted 1 if ''su root'' command attempted; 0 otherwise discrete
num_root number of ''root'' accesses continuous
num_file_creations number of file creation operations continuous
num_shells number of shell prompts continuous
num_access_files number of operations on access control files continuous
num_outbound_cmds number of outbound commands in an ftp session continuous
is_hot_login 1 if the login belongs to the ''hot'' list; 0 otherwise discrete
is_guest_login 1 if the login is a ''guest''login; 0 otherwise discrete

In [1]:
%matplotlib inline
#%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import warnings
import constants
import utils

warnings.filterwarnings('ignore')
np.random.seed(42)

In [2]:
data_10_percent = 'kddcup.data_10_percent'
data_full = 'kddcup.data'
data = pd.read_csv(data_10_percent, names=constants.names)

# Remove Traffic features computed using a two-second time window
data.drop(constants.traffic_features, inplace=True, axis=1)

# Categorical features to numeric labels

from sklearn import preprocessing

le_dicts = {}

for categorical_name in constants.categorical_names:
    le = preprocessing.LabelEncoder()
    le.fit(data[categorical_name])
    le_dicts[categorical_name] = dict(zip(le.transform(le.classes_), le.classes_))
#     print(categorical_name, ':', le_dicts[categorical_name])
    data[categorical_name + '_num'] = le.fit_transform(data[categorical_name])

data['label_binary_num'] = data.label.apply(lambda label: 1 if label == 'normal.' else -1)
data['label_four'] = data.label.apply(lambda label: constants.label_to_four_attack_class[label])
data['label_four_num'] = data.label_four.apply(lambda label: constants.five_classes_to_num[label])
data.drop(constants.categorical_names + ['label', 'label_four'], axis=1, inplace=True)
data.shape


Out[2]:
(494021, 24)

Drop all DDOS attack data


In [3]:
data = data[data['label_four_num'] != 4]
data.shape


Out[3]:
(102563, 24)

In [4]:
pd.value_counts(data['label_four_num'], sort=True)


Out[4]:
0    97278
1     4107
2     1126
3       52
Name: label_four_num, dtype: int64

In [5]:
# Normalize data
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
data_scaled = data.copy(deep=True)
data_scaled[constants.names_to_normalize] = min_max_scaler.fit_transform(data[constants.names_to_normalize])

In [6]:
#test-train division
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(data, test_size=0.1, random_state=42)

Y_train_bin = X_train.label_binary_num
Y_train_many_classes = X_train.label_four_num

Y_test_bin = X_test.label_binary_num
Y_test_many_classes = X_test.label_four_num

X_train.drop(['label_binary_num', 'label_four_num'], axis=1, inplace=True)
X_test.drop(['label_binary_num', 'label_four_num'], axis=1, inplace=True)

In [7]:
#test-train division scaled  + without features without changes
data_scaled.drop(['label_binary_num', 'label_four_num'] +\
                 constants.names_without_changes,
                 axis=1)

X_train_scaled, X_test_scaled = train_test_split(data_scaled,
                                                 test_size=0.1,
                                                 random_state=42)

In [8]:
# #KMeans for not scaled data 
# from sklearn.cluster import KMeans
# from sklearn.metrics import confusion_matrix, accuracy_score

# kmeans = KMeans(n_clusters=2,
#                 max_iter=500,
#                 random_state=42)

# utils.model_validation(kmeans, X_train, X_test, Y_train_bin, Y_test_bin)

In [9]:
# One class SVM
# Not scaled data

from sklearn.svm import OneClassSVM

model = OneClassSVM(nu=0.1,
                    gamma=0.00005,
                    kernel='rbf')

utils.model_validation(model,
                       X_train,
                       X_test,
                       Y_train_bin,
                       Y_test_bin,
                       True)


Train: 
[[  281  4457]
 [ 9508 78060]]
accuracy:  0.848709726345
f1 score:  0.917893994179
Test: 
[[  33  514]
 [1039 8671]]
accuracy:  0.848591206006
f1 score:  0.917808944165

In [10]:
# One class SVM
# scaled data

from sklearn.svm import OneClassSVM

model = OneClassSVM(nu=0.1,
                    gamma=0.00005,
                    kernel='rbf')

utils.model_validation(model,
                       X_train_scaled,
                       X_test_scaled,
                       Y_train_bin,
                       Y_test_bin,
                       True)


Train: 
[[  125  4613]
 [ 1265 86303]]
accuracy:  0.936320499209
f1 score:  0.96706707604
Test: 
[[  16  531]
 [ 131 9579]]
accuracy:  0.935458711124
f1 score:  0.966599394551