In [2]:
import pandas as pd
import numpy as np
pd.set_option("display.max_rows",15)
%matplotlib inline
In [3]:
class dataset:
    """Namespace for the raw train/test frames, loaded when the class body runs.

    Attributes:
        col_names: column headers applied to both input files.
        kdd_train / kdd_test: the loaded frames without ``difficulty_level``.
        kdd_diff_level_train / kdd_diff_level_test: the ``difficulty_level``
            column of each split, kept separately as a Series.

    Side effects: reads two text files under ``dataset/`` and writes a CSV
    copy of each frame (index included) next to them.
    """

    # Headers are passed explicitly via `names=`, so the first row of each
    # file is read as data rather than as a header line.
    col_names = [
        "duration", "protocol_type", "service", "flag",
        "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
        "num_failed_logins", "logged_in", "num_compromised", "root_shell",
        "su_attempted", "num_root", "num_file_creations", "num_shells",
        "num_access_files", "num_outbound_cmds", "is_host_login",
        "is_guest_login", "count", "srv_count",
        "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
        "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
        "dst_host_count", "dst_host_srv_count",
        "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
        "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
        "dst_host_serror_rate", "dst_host_srv_serror_rate",
        "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
        "label", "difficulty_level",
    ]

    # --- training split ---
    kdd_train = pd.read_csv("dataset/KDDTrain+_20Percent.txt", names=col_names)
    # Keep the difficulty level aside before removing it from the frame.
    kdd_diff_level_train = kdd_train["difficulty_level"].copy()
    kdd_train = kdd_train.drop("difficulty_level", axis="columns")
    kdd_train.to_csv("dataset/KDDTrain+_20Percent.csv")

    # --- test split (same steps) ---
    kdd_test = pd.read_csv("dataset/KDDTest-21.txt", names=col_names)
    kdd_diff_level_test = kdd_test["difficulty_level"].copy()
    kdd_test = kdd_test.drop("difficulty_level", axis="columns")
    kdd_test.to_csv("dataset/KDDTest-21.csv")
In [4]:
# Columns holding symbolic values; they are converted to pandas categoricals
# so that later one-hot encoding yields the same columns for both splits.
category_variables = ["protocol_type", "service", "flag"]

for cv in category_variables:
    # The training split defines the category vocabulary for this column.
    dataset.kdd_train[cv] = dataset.kdd_train[cv].astype("category")
    train_categories = dataset.kdd_train[cv].cat.categories

    # Cast the test column onto the *training* categories. `Series.astype`
    # no longer accepts a `categories=` keyword, so the vocabulary is passed
    # through a CategoricalDtype instead. Test values that never occur in
    # the training split become NaN, exactly as with the old keyword form.
    dataset.kdd_test[cv] = dataset.kdd_test[cv].astype(
        pd.api.types.CategoricalDtype(categories=train_categories)
    )

    print("Length of Categories for {} are {}".format(cv, len(train_categories)))
    print("Categories for {} are {} \n".format(cv, train_categories))
In [4]:
# Display the training frame; the display.max_rows option set in the first
# cell limits how many rows are rendered.
dataset.kdd_train
Out[4]:
In [5]:
# Display the test frame.
dataset.kdd_test
Out[5]:
In [6]:
# Summary statistics of the training frame.
dataset.kdd_train.describe()
Out[6]:
In [7]:
# Summary statistics of the test frame, for comparison with the training split.
dataset.kdd_test.describe()
Out[7]:
In [8]:
# Inspect the raw `label` column of the training split.
print("Column - Label")
# Print the distinct label values. Formatting the Series itself would print
# its rows, not the unique values that the heading announces.
print("Unique values: \n{}".format(dataset.kdd_train.label.unique()))
print("\nStatistical properties: \n{}".format(dataset.kdd_train.label.describe()))
In [9]:
# Raw labels grouped under the category each one is assigned to. The order of
# groups and of labels within a group fixes the insertion order of the
# flattened lookup built below.
_labels_by_category = {
    "normal": ("normal",),
    "DoS": (
        "back", "land", "neptune", "pod", "smurf", "teardrop",
        "mailbomb", "apache2", "processtable", "udpstorm",
    ),
    "Probe": ("ipsweep", "nmap", "portsweep", "satan", "mscan", "saint"),
    "R2L": (
        "ftp_write", "guess_passwd", "imap", "multihop", "phf", "spy",
        "warezclient", "warezmaster", "sendmail", "named",
        "snmpgetattack", "snmpguess", "xlock", "xsnoop", "worm",
    ),
    "U2R": (
        "buffer_overflow", "loadmodule", "perl", "rootkit",
        "httptunnel", "ps", "sqlattack", "xterm",
    ),
}

# Lookup: raw label -> category ("normal", "DoS", "Probe", "R2L" or "U2R").
attack_types = {
    label: category
    for category, labels in _labels_by_category.items()
    for label in labels
}

# Lookup: category -> binary verdict; only "normal" maps to "Normal".
is_attack = {
    category: "Normal" if category == "normal" else "Attack"
    for category in ("DoS", "R2L", "U2R", "Probe", "normal")
}
In [10]:
# Add two derived columns to each split, training split first:
#   "type" - the category of the raw label, looked up in `attack_types`
#   "is"   - the binary verdict for that category, looked up in `is_attack`
# Direct dict indexing is used for the lookup, so a label or category that is
# missing from the dicts raises KeyError instead of yielding NaN.
for split in (dataset.kdd_train, dataset.kdd_test):
    split["type"] = split["label"].map(attack_types.__getitem__)
    split["is"] = split["type"].map(is_attack.__getitem__)
In [11]:
# Group the training rows by the derived "type" column and by the derived
# binary "is" column; both group objects are reused by the cells below.
kdd_attack_type_group = dataset.kdd_train.groupby("type")
kdd_is_attack_group = dataset.kdd_train.groupby("is")
In [12]:
# Number of training rows in each "type" group.
kdd_attack_type_group.type.count()
Out[12]:
In [13]:
# Number of training rows in each "is" group.
kdd_is_attack_group["is"].count()
Out[13]:
In [14]:
# Index the training frame by "is" so rows can be selected with .loc, then
# list the distinct raw labels found among the "Attack" rows.
df = dataset.kdd_train.set_index("is")
df.loc["Attack"].label.unique()
Out[14]:
In [15]:
# Distinct raw labels found among the "Normal" rows (reuses `df` from the
# previous cell).
df.loc["Normal"].label.unique()
Out[15]:
In [16]:
# Histograms drawn separately for each "is" group.
kdd_is_attack_group.hist(figsize=[25,22])
Out[16]:
In [17]:
# Histograms drawn separately for each "type" group.
kdd_attack_type_group.hist(figsize=[25,22])
Out[17]:
In [18]:
# Share (in percent) of training rows at each difficulty level: group the
# Series by its own values, count each group, divide by the total count.
gb = dataset.kdd_diff_level_train.groupby(dataset.kdd_diff_level_train)
(gb.count() / dataset.kdd_diff_level_train.count())*100
Out[18]:
In [19]:
# Same percentage breakdown for the test split. Note that `gb` is rebound
# here, so after this cell it refers to the test grouping.
gb = dataset.kdd_diff_level_test.groupby(dataset.kdd_diff_level_test)
(gb.count() / dataset.kdd_diff_level_test.count())*100
Out[19]:
In [20]:
# Columns to one-hot encode for each task: the categorical feature columns
# plus the task's own target column ("is" for 2 labels, "type" for 5 labels).
dummy_variables_2labels = [*category_variables, "is"]
dummy_variables_5labels = [*category_variables, "type"]


def _one_hot_encode(frame, dummy_columns, unused_columns):
    """Return a new frame with `dummy_columns` one-hot encoded and `unused_columns` removed.

    Args:
        frame: source DataFrame; it is not modified.
        dummy_columns: columns to expand with ``pd.get_dummies``; each column
            name is also used as the prefix of its indicator columns.
        unused_columns: columns to drop from the encoded result.

    Returns:
        A new DataFrame.
    """
    encoded = pd.get_dummies(frame, columns=dummy_columns, prefix=dummy_columns)
    # Return a new object instead of dropping in place, so re-running the
    # cell cannot fail on columns that were already removed.
    return encoded.drop(unused_columns, axis="columns")


class preprocessing:
    """Namespace for the encoded frames and target Series built from `dataset`.

    ``*_2labels`` frames one-hot encode the "is" target and omit "label" and
    "type"; ``*_5labels`` frames one-hot encode the "type" target and omit
    "label" and "is". The ``*_y`` attributes hold the un-encoded target
    column of each split.
    """

    # NOTE(review): the dtype of the indicator columns produced by
    # pd.get_dummies depends on the installed pandas version - confirm that
    # downstream consumers of these frames accept it.
    kdd_train_2labels = _one_hot_encode(dataset.kdd_train, dummy_variables_2labels, ["label", "type"])
    kdd_train_5labels = _one_hot_encode(dataset.kdd_train, dummy_variables_5labels, ["label", "is"])

    kdd_test_2labels = _one_hot_encode(dataset.kdd_test, dummy_variables_2labels, ["label", "type"])
    kdd_test_5labels = _one_hot_encode(dataset.kdd_test, dummy_variables_5labels, ["label", "is"])

    # Un-encoded targets, copied so later edits to `dataset` do not leak in.
    kdd_train_2labels_y = dataset.kdd_train["is"].copy()  # For SVM
    kdd_train_5labels_y = dataset.kdd_train["type"].copy()  # For SVM

    kdd_test_2labels_y = dataset.kdd_test["is"].copy()  # For SVM
    kdd_test_5labels_y = dataset.kdd_test["type"].copy()  # For SVM
In [21]:
# Write the column names of the two encoded training frames to CSV files.
preprocessing.kdd_train_2labels.columns.to_series().to_csv("dataset/columns_2labels.csv")
preprocessing.kdd_train_5labels.columns.to_series().to_csv("dataset/columns_5labels.csv")
In [22]:
# Shape checks (rows, columns) of the encoded frames and target Series. The
# train and test frame of the same task should report the same column count.
preprocessing.kdd_train_2labels.shape
Out[22]:
In [23]:
preprocessing.kdd_train_5labels.shape
Out[23]:
In [24]:
preprocessing.kdd_test_2labels.shape
Out[24]:
In [25]:
preprocessing.kdd_test_5labels.shape
Out[25]:
In [26]:
# Target Series: the length should match the row count of the matching frame.
preprocessing.kdd_train_2labels_y.shape
Out[26]:
In [27]:
preprocessing.kdd_test_2labels_y.shape
Out[27]:
In [28]:
preprocessing.kdd_train_5labels_y.shape
Out[28]:
In [29]:
preprocessing.kdd_test_5labels_y.shape
Out[29]:
In [31]:
# Persist every prepared object as a pickle under dataset/. Each entry pairs
# the object with its destination path; files are written in the order listed.
for prepared, pickle_path in (
    (preprocessing.kdd_train_2labels, "dataset/kdd_train_2labels_20percent.pkl"),
    (preprocessing.kdd_train_2labels_y, "dataset/kdd_train_2labels_y_20percent.pkl"),
    (preprocessing.kdd_train_5labels, "dataset/kdd_train_5labels_20percent.pkl"),
    (preprocessing.kdd_train_5labels_y, "dataset/kdd_train_5labels_y_20percent.pkl"),
    (preprocessing.kdd_test_2labels, "dataset/kdd_test_2labels_20percent.pkl"),
    (preprocessing.kdd_test_2labels_y, "dataset/kdd_test_2labels_y_20percent.pkl"),
    (preprocessing.kdd_test_5labels, "dataset/kdd_test_5labels_20percent.pkl"),
    (preprocessing.kdd_test_5labels_y, "dataset/kdd_test_5labels_y_20percent.pkl"),
    # Difficulty levels that were split off the raw frames at load time.
    (dataset.kdd_diff_level_train, "dataset/kdd_diff_level_train_20percent.pkl"),
    (dataset.kdd_diff_level_test, "dataset/kdd_diff_level_test_20percent.pkl"),
):
    prepared.to_pickle(pickle_path)
In [ ]: