| feature name | description | type |
| --- | --- | --- |
| duration | length (number of seconds) of the connection | continuous |
| protocol_type | type of the protocol, e.g. tcp, udp, etc. | discrete |
| service | network service on the destination, e.g. http, telnet, etc. | discrete |
| src_bytes | number of data bytes from source to destination | continuous |
| dst_bytes | number of data bytes from destination to source | continuous |
| flag | normal or error status of the connection | discrete |
| land | 1 if connection is from/to the same host/port; 0 otherwise | discrete |
| wrong_fragment | number of "wrong" fragments | continuous |
| urgent | number of urgent packets | continuous |
| feature name | description | type |
| --- | --- | --- |
| hot | number of "hot" indicators | continuous |
| num_failed_logins | number of failed login attempts | continuous |
| logged_in | 1 if successfully logged in; 0 otherwise | discrete |
| num_compromised | number of "compromised" conditions | continuous |
| root_shell | 1 if root shell is obtained; 0 otherwise | discrete |
| su_attempted | 1 if "su root" command attempted; 0 otherwise | discrete |
| num_root | number of "root" accesses | continuous |
| num_file_creations | number of file creation operations | continuous |
| num_shells | number of shell prompts | continuous |
| num_access_files | number of operations on access control files | continuous |
| num_outbound_cmds | number of outbound commands in an ftp session | continuous |
| is_host_login | 1 if the login belongs to the "hot" list; 0 otherwise | discrete |
| is_guest_login | 1 if the login is a "guest" login; 0 otherwise | discrete |
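The cells below lean on a project-specific `constants` module that is not included in this notebook. As a reading aid, here is a hypothetical minimal sketch of what it might contain; every name and value in it is an assumption inferred from the feature tables above and from how the module is used later.

```python
# constants.py -- hypothetical sketch; the real module is not included
# in this notebook, so all names and values below are assumptions.

# Column names for pd.read_csv: the 41 KDD Cup 99 features plus 'label'.
names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
    'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    # ... the remaining content and traffic feature names ...
    'label',
]

# String-valued features that need label encoding.
categorical_names = ['protocol_type', 'service', 'flag']

# Traffic features computed over a two-second window (dropped below).
traffic_features = ['count', 'srv_count']  # ... and the rest ...

# Continuous features to min-max scale, and features that scaling
# leaves unchanged (e.g. 0/1 flags).
names_to_normalize = ['duration', 'src_bytes', 'dst_bytes']  # ...
names_without_changes = ['land', 'logged_in', 'root_shell']  # ...

# Map each raw label to a broad class, and each class to a numeric id
# (the ordering here is an assumption).
label_to_four_attack_class = {'normal.': 'normal', 'smurf.': 'dos',
                              'neptune.': 'dos'}  # ... one entry per label ...
five_classes_to_num = {'normal': 0, 'dos': 1, 'probe': 2,
                       'r2l': 3, 'u2r': 4}
```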
In [1]:
%matplotlib inline
#%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import warnings
import constants  # project module with the feature names and label mappings
import utils      # project module with the model validation helper
warnings.filterwarnings('ignore')
np.random.seed(42)  # fix the seed for reproducibility
In [2]:
data_10_percent = 'kddcup.data_10_percent'
data_full = 'kddcup.data'
data = pd.read_csv(data_10_percent, names=constants.names)
# Remove traffic features computed using a two-second time window
data.drop(constants.traffic_features, inplace=True, axis=1)
# Encode categorical features as numeric labels
from sklearn import preprocessing
le_dicts = {}
for categorical_name in constants.categorical_names:
    le = preprocessing.LabelEncoder()
    le.fit(data[categorical_name])
    le_dicts[categorical_name] = dict(zip(le.transform(le.classes_), le.classes_))
    # print(categorical_name, ':', le_dicts[categorical_name])
    data[categorical_name + '_num'] = le.transform(data[categorical_name])
# Binary label: 1 for normal traffic, -1 for any attack
data['label_binary_num'] = data.label.apply(lambda label: 1 if label == 'normal.' else -1)
# Group the individual attack labels into the four broad attack classes
data['label_four'] = data.label.apply(lambda label: constants.label_to_four_attack_class[label])
data['label_four_num'] = data.label_four.apply(lambda label: constants.five_classes_to_num[label])
data.drop(constants.categorical_names + ['label', 'label_four'], axis=1, inplace=True)
data.shape
Out[2]:
In [3]:
data = data[data['label_four_num'] != 4]
data.shape
Out[3]:
In [4]:
data['label_four_num'].value_counts()
Out[4]:
In [5]:
# Min-max normalize the selected features to the [0, 1] range
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
data_scaled = data.copy(deep=True)
data_scaled[constants.names_to_normalize] = min_max_scaler.fit_transform(data[constants.names_to_normalize])
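One caveat worth flagging: the scaler above is fit on the full dataset before the train/test split, so test-set statistics leak into the scaling. A minimal leak-free variant (a sketch, not the notebook's actual pipeline) fits the scaler on the training split only:

```python
# Sketch of a leak-free alternative: split first, fit the scaler on the
# training portion only, then apply the same transform to the test set.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

train, test = train_test_split(data, test_size=0.1, random_state=42)
train, test = train.copy(), test.copy()
scaler = MinMaxScaler().fit(train[constants.names_to_normalize])
train[constants.names_to_normalize] = scaler.transform(train[constants.names_to_normalize])
test[constants.names_to_normalize] = scaler.transform(test[constants.names_to_normalize])
```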
In [6]:
# Train/test split of the unscaled data
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(data, test_size=0.1, random_state=42)
Y_train_bin = X_train.label_binary_num
Y_train_many_classes = X_train.label_four_num
Y_test_bin = X_test.label_binary_num
Y_test_many_classes = X_test.label_four_num
# Drop the label columns from the feature matrices
X_train.drop(['label_binary_num', 'label_four_num'], axis=1, inplace=True)
X_test.drop(['label_binary_num', 'label_four_num'], axis=1, inplace=True)
In [7]:
# Train/test split of the scaled data, after dropping the label columns
# and the features left unchanged by scaling
data_scaled = data_scaled.drop(['label_binary_num', 'label_four_num'] +
                               constants.names_without_changes,
                               axis=1)
X_train_scaled, X_test_scaled = train_test_split(data_scaled,
                                                 test_size=0.1,
                                                 random_state=42)
In [8]:
# # KMeans baseline on the unscaled data
# from sklearn.cluster import KMeans
# from sklearn.metrics import confusion_matrix, accuracy_score
# kmeans = KMeans(n_clusters=2,
#                 max_iter=500,
#                 random_state=42)
# utils.model_validation(kmeans, X_train, X_test, Y_train_bin, Y_test_bin)
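`utils.model_validation` is a project helper whose source is likewise not included here. A plausible minimal sketch follows, assuming the trailing boolean flag marks one-class models whose `predict` already returns +1/-1; the real helper may differ.

```python
from sklearn.metrics import accuracy_score, confusion_matrix

def model_validation(model, X_train, X_test, y_train, y_test,
                     one_class=False):
    # Hypothetical sketch of utils.model_validation; the actual helper
    # is not shown in this notebook.
    if one_class:
        # One-class models fit on the features alone and predict
        # +1 (inlier / normal) or -1 (outlier / attack), matching the
        # label_binary_num encoding defined earlier.
        model.fit(X_train)
    else:
        # For KMeans this ignores y_train; mapping cluster ids onto the
        # true labels is omitted in this sketch.
        model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('accuracy:', accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
```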
In [9]:
# One-class SVM
# Unscaled data
from sklearn.svm import OneClassSVM
model = OneClassSVM(nu=0.1,
                    gamma=0.00005,
                    kernel='rbf')
utils.model_validation(model,
                       X_train,
                       X_test,
                       Y_train_bin,
                       Y_test_bin,
                       True)
In [10]:
# One-class SVM
# Scaled data
from sklearn.svm import OneClassSVM
model = OneClassSVM(nu=0.1,
                    gamma=0.00005,
                    kernel='rbf')
utils.model_validation(model,
                       X_train_scaled,
                       X_test_scaled,
                       Y_train_bin,
                       Y_test_bin,
                       True)
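For context on the two hyper-parameters: in scikit-learn's `OneClassSVM`, `nu` is an upper bound on the fraction of training errors and a lower bound on the fraction of support vectors, so `nu=0.1` allows roughly 10% of training connections to fall outside the learned boundary, while `gamma` sets the width of the RBF kernel. A coarse sweep over both (a sketch with illustrative values, not tuned results) would look like:

```python
# Sketch: coarse grid over nu and gamma on the scaled data; the values
# are illustrative starting points, not tuned results.
from sklearn.svm import OneClassSVM

for nu in (0.01, 0.05, 0.1):
    for gamma in (1e-5, 5e-5, 1e-4):
        m = OneClassSVM(nu=nu, gamma=gamma, kernel='rbf').fit(X_train_scaled)
        acc = (m.predict(X_test_scaled) == Y_test_bin).mean()
        print(f'nu={nu}, gamma={gamma}: accuracy={acc:.3f}')
```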