In [3]:
import pandas as pd
import numpy as np
pd.set_option("display.max_rows",15)
%matplotlib inline
In [4]:
class dataset:
    """Namespace holding the four NSL-KDD splits.

    Loads the raw text files, keeps the ``difficulty_level`` column aside
    for the full train/test splits, drops it from every feature frame, and
    writes CSV copies of the cleaned frames.
    """

    # 41 features + label + difficulty level, per the NSL-KDD column spec.
    col_names = [
        "duration", "protocol_type", "service", "flag", "src_bytes",
        "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
        "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
        "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
        "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
        "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
        "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
        "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
        "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label", "difficulty_level",
    ]

    kdd_train = pd.read_csv("dataset/KDDTrain+.txt", names=col_names)
    kdd_test = pd.read_csv("dataset/KDDTest+.txt", names=col_names)
    kdd_train_ = pd.read_csv("dataset/KDDTrain+_20Percent.txt", names=col_names)
    kdd_test_ = pd.read_csv("dataset/KDDTest-21.txt", names=col_names)

    # Keep the difficulty levels of the full splits before discarding the column.
    kdd_diff_level_train = kdd_train["difficulty_level"].copy()
    kdd_diff_level_test = kdd_test["difficulty_level"].copy()

    kdd_train = kdd_train.drop(columns="difficulty_level")
    kdd_test = kdd_test.drop(columns="difficulty_level")
    kdd_train_ = kdd_train_.drop(columns="difficulty_level")
    kdd_test_ = kdd_test_.drop(columns="difficulty_level")

    # CSV copies for inspection / external tooling.
    kdd_train.to_csv("dataset/KDDTrain+.csv")
    kdd_test.to_csv("dataset/KDDTest+.csv")
    kdd_train_.to_csv("dataset/KDDTrain_.csv")
    kdd_test_.to_csv("dataset/KDDTest_.csv")
In [5]:
category_variables = ["protocol_type", "service", "flag"]
for cv in category_variables:
    # Fit the category set on the training split only.
    dataset.kdd_train[cv] = dataset.kdd_train[cv].astype("category")
    # BUG FIX: astype("category", categories=...) was deprecated in pandas 0.21
    # and removed in 0.25. Use an explicit CategoricalDtype to align every
    # split to the training categories (values unseen in training become NaN,
    # matching the original intent).
    train_dtype = pd.api.types.CategoricalDtype(
        categories=dataset.kdd_train[cv].cat.categories)
    dataset.kdd_test[cv] = dataset.kdd_test[cv].astype(train_dtype)
    dataset.kdd_train_[cv] = dataset.kdd_train_[cv].astype(train_dtype)
    dataset.kdd_test_[cv] = dataset.kdd_test_[cv].astype(train_dtype)
    print("Length of Categories for {} are {}".format(cv, len(dataset.kdd_train[cv].cat.categories)))
    print("Categories for {} are {} \n".format(cv, dataset.kdd_train[cv].cat.categories))
In [6]:
# Rich display of the training frame (rows truncated by display.max_rows).
dataset.kdd_train
Out[6]:
In [7]:
# Rich display of the test frame.
dataset.kdd_test
Out[7]:
In [8]:
# Summary statistics of the numeric training columns.
dataset.kdd_train.describe()
Out[8]:
In [9]:
# Fraction of all cells in the training frame that are exactly zero
# (the data is very sparse).
zero_mask = dataset.kdd_train.isin([0])
zero_mask.sum().sum() / zero_mask.size
Out[9]:
In [10]:
# Summary statistics of the numeric test columns.
dataset.kdd_test.describe()
Out[10]:
In [11]:
print("Column - Label")
# BUG FIX: the message promised unique values but the cell printed the whole
# (truncated) series; show the actual distinct labels instead.
print("Unique values: \n{}".format(dataset.kdd_train.label.unique()))
print("\nStatistical properties: \n{}".format(dataset.kdd_train.label.describe()))
In [12]:
# Raw NSL-KDD labels grouped by attack family; flattened below into the
# label -> family lookup used throughout the notebook.
_labels_by_family = {
    'normal': ['normal'],
    'DoS': ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
            'mailbomb', 'apache2', 'processtable', 'udpstorm'],
    'Probe': ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint'],
    'R2L': ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
            'warezclient', 'warezmaster', 'sendmail', 'named', 'snmpgetattack',
            'snmpguess', 'xlock', 'xsnoop', 'worm'],
    'U2R': ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'httptunnel',
            'ps', 'sqlattack', 'xterm'],
}

# Map each raw label to its attack family (normal / DoS / Probe / R2L / U2R).
attack_types = {label: family
                for family, labels in _labels_by_family.items()
                for label in labels}

# Collapse the four attack families into a binary Attack/Normal flag.
is_attack = {family: ("Normal" if family == "normal" else "Attack")
             for family in _labels_by_family}
In [13]:
# Derive the 5-class attack family ("type") and binary flag ("is") for every
# split. The dict lookup inside the lambda raises KeyError on any label that
# is missing from the mapping, surfacing unmapped labels immediately.
for split in (dataset.kdd_train, dataset.kdd_test,
              dataset.kdd_train_, dataset.kdd_test_):
    split["type"] = split.label.map(lambda lbl: attack_types[lbl])
    split["is"] = split.type.map(lambda fam: is_attack[fam])
In [14]:
# Zero-cell count, total cell count, and zero fraction for Normal rows only.
by_flag = dataset.kdd_train.set_index("is")
normal_rows = by_flag.loc["Normal"]
print(normal_rows.isin([0]).sum().sum())
print(normal_rows.size)
normal_rows.isin([0]).sum().sum() / normal_rows.size
Out[14]:
In [15]:
# Zero-cell count, total cell count, and zero fraction for Attack rows only.
by_flag = dataset.kdd_train.set_index("is")
attack_rows = by_flag.loc["Attack"]
print(attack_rows.isin([0]).sum().sum())
print(attack_rows.size)
attack_rows.isin([0]).sum().sum() / attack_rows.size
Out[15]:
In [16]:
# NOTE(review): hard-coded counts — presumably the zero-cell totals printed by
# the two cells above; recompute from the frames instead of copying if reused.
1804888 / (1804888 + 1538253)
Out[16]:
In [17]:
# Group the training frame by attack family and by the binary flag.
kdd_attack_type_group = dataset.kdd_train.groupby("type")
kdd_is_attack_group = dataset.kdd_train.groupby("is")
In [18]:
# Row count per attack family.
kdd_attack_type_group.type.count()
Out[18]:
In [19]:
# Row count per Attack/Normal class.
kdd_is_attack_group["is"].count()
Out[19]:
In [20]:
# Displays only the GroupBy object's repr, not its contents.
kdd_attack_type_group
Out[20]:
In [21]:
# Distinct raw labels that map to the Attack class.
df = dataset.kdd_train.set_index("is")
df.loc["Attack"].label.unique()
Out[21]:
In [22]:
# Distinct raw labels that map to the Normal class.
df.loc["Normal"].label.unique()
Out[22]:
In [23]:
#kdd_is_attack_group.hist(figsize=[25,22])
In [24]:
#kdd_attack_type_group.hist(figsize=[25,22])
In [25]:
# Percentage distribution of difficulty levels in the training set.
gb = dataset.kdd_diff_level_train.groupby(dataset.kdd_diff_level_train)
(gb.count() / dataset.kdd_diff_level_train.count())*100
Out[25]:
In [26]:
# Percentage distribution of difficulty levels in the test set.
gb = dataset.kdd_diff_level_test.groupby(dataset.kdd_diff_level_test)
(gb.count() / dataset.kdd_diff_level_test.count())*100
Out[26]:
In [ ]:
In [27]:
# Categorical feature columns plus the target column to one-hot encode,
# and the integer codes used for the SVM-style targets.
dummy_variables_2labels = category_variables + ["is"]
dummy_variables_5labels = category_variables + ["type"]
attack_codes_2labels = {"Attack": 1, "Normal": 0}
attack_codes_5labels = {'normal': 0, 'DoS': 1, 'Probe': 2, 'R2L': 3, 'U2R': 4}


class preprocessing:
    """One-hot encoded feature frames and integer-coded targets per split.

    For each split there is a 2-label (Attack/Normal) and a 5-label (attack
    family) variant: ``*_2labels`` / ``*_5labels`` are the dummy-encoded
    feature frames, ``*_y`` the integer-coded target series (for SVM use).
    """

    # 2-label frames: one-hot the categoricals + "is", drop the raw label and
    # the unused 5-class column.
    kdd_train_2labels = pd.get_dummies(
        dataset.kdd_train, columns=dummy_variables_2labels,
        prefix=dummy_variables_2labels).drop(["label", "type"], axis=1)
    kdd_test_2labels = pd.get_dummies(
        dataset.kdd_test, columns=dummy_variables_2labels,
        prefix=dummy_variables_2labels).drop(["label", "type"], axis=1)
    kdd_train__2labels = pd.get_dummies(
        dataset.kdd_train_, columns=dummy_variables_2labels,
        prefix=dummy_variables_2labels).drop(["label", "type"], axis=1)
    kdd_test__2labels = pd.get_dummies(
        dataset.kdd_test_, columns=dummy_variables_2labels,
        prefix=dummy_variables_2labels).drop(["label", "type"], axis=1)

    # 5-label frames: one-hot the categoricals + "type", drop the raw label
    # and the unused binary column.
    kdd_train_5labels = pd.get_dummies(
        dataset.kdd_train, columns=dummy_variables_5labels,
        prefix=dummy_variables_5labels).drop(["label", "is"], axis=1)
    kdd_test_5labels = pd.get_dummies(
        dataset.kdd_test, columns=dummy_variables_5labels,
        prefix=dummy_variables_5labels).drop(["label", "is"], axis=1)
    kdd_train__5labels = pd.get_dummies(
        dataset.kdd_train_, columns=dummy_variables_5labels,
        prefix=dummy_variables_5labels).drop(["label", "is"], axis=1)
    kdd_test__5labels = pd.get_dummies(
        dataset.kdd_test_, columns=dummy_variables_5labels,
        prefix=dummy_variables_5labels).drop(["label", "is"], axis=1)

    # Integer-coded targets; KeyError on any class missing from the code maps.
    kdd_train_2labels_y = dataset.kdd_train["is"].map(lambda x: attack_codes_2labels[x])
    kdd_test_2labels_y = dataset.kdd_test["is"].map(lambda x: attack_codes_2labels[x])
    kdd_train__2labels_y = dataset.kdd_train_["is"].map(lambda x: attack_codes_2labels[x])
    kdd_test__2labels_y = dataset.kdd_test_["is"].map(lambda x: attack_codes_2labels[x])
    kdd_train_5labels_y = dataset.kdd_train["type"].map(lambda x: attack_codes_5labels[x])
    kdd_test_5labels_y = dataset.kdd_test["type"].map(lambda x: attack_codes_5labels[x])
    kdd_train__5labels_y = dataset.kdd_train_["type"].map(lambda x: attack_codes_5labels[x])
    kdd_test__5labels_y = dataset.kdd_test_["type"].map(lambda x: attack_codes_5labels[x])
In [ ]:
In [28]:
# Save the dummy-encoded column names to CSV (useful for realigning frames).
preprocessing.kdd_train_2labels.columns.to_series().to_csv("dataset/columns_2labels.csv")
preprocessing.kdd_train_5labels.columns.to_series().to_csv("dataset/columns_5labels.csv")
In [29]:
# Column names after one-hot encoding (2-label training frame).
preprocessing.kdd_train_2labels.columns
Out[29]:
In [30]:
# (rows, columns) of the 2-label training frame.
preprocessing.kdd_train_2labels.shape
Out[30]:
In [31]:
# (rows, columns) of the 5-label training frame.
preprocessing.kdd_train_5labels.shape
Out[31]:
In [32]:
# (rows, columns) of the 2-label test frame.
preprocessing.kdd_test_2labels.shape
Out[32]:
In [33]:
# (rows, columns) of the 5-label test frame.
preprocessing.kdd_test_5labels.shape
Out[33]:
In [34]:
# Length of the 2-label training target.
preprocessing.kdd_train_2labels_y.shape
Out[34]:
In [35]:
# Length of the 2-label test target.
preprocessing.kdd_test_2labels_y.shape
Out[35]:
In [36]:
# Length of the 5-label training target.
preprocessing.kdd_train_5labels_y.shape
Out[36]:
In [37]:
# Length of the 5-label test target.
preprocessing.kdd_test_5labels_y.shape
Out[37]:
In [38]:
import matplotlib
from pandas.plotting import andrews_curves
from pandas.plotting import parallel_coordinates
from sklearn import preprocessing as ps
from pandas.plotting import radviz
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
In [39]:
# Feature matrices for visualization: strip the one-hot target columns and
# re-attach the integer-coded target as the last column.
target_dummies = ["is_Attack", "is_Normal"]
df_train = preprocessing.kdd_train_2labels.drop(columns=target_dummies)
df_test = preprocessing.kdd_test_2labels.drop(columns=target_dummies)
df_train = pd.concat([df_train, preprocessing.kdd_train_2labels_y], axis=1)
df_test = pd.concat([df_test, preprocessing.kdd_test_2labels_y], axis=1)
In [40]:
# 2-D t-SNE with a fixed seed for reproducibility.
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
#np.set_printoptions(suppress=True)
#sample = df_train.sample(int(df_train.shape[0]*.1)) # 10% of total data
#sample.to_pickle("dataset/tsne_sample.pkl")
# Load the cached 10% sample; regenerate it with the commented lines above.
sample = pd.read_pickle("dataset/tsne_sample.pkl")
In [ ]:
In [41]:
# Split the cached sample into features and the target (last column).
x_tsne = sample.iloc[:, :-1]
y_tsne = sample.iloc[:, -1]
from sklearn.decomposition import SparsePCA
pca_analysis = SparsePCA(n_components=40)
# Fitting is expensive; the transformed result is loaded from a pickle below.
#x_tsne_pca = pca_analysis.fit_transform(x_tsne)
In [42]:
#pd.DataFrame(x_tsne_pca).to_pickle("dataset/tsne_pca_df.pkl")
# Cached SparsePCA projection of the sample (40 components) as an ndarray.
x_tsne_pca = pd.read_pickle("dataset/tsne_pca_df.pkl").values
In [43]:
# Attach readable class names to the PCA frame for plotting.
x_tsne_pca_df = pd.DataFrame(x_tsne_pca)
codes_to_attack = {1:"Attack", 0:"Normal"}
y_tsne_cta = y_tsne.map(lambda x: codes_to_attack[x])
x_tsne_pca_df['is'] = y_tsne_cta.values
In [44]:
# Andrews curves of the PCA-reduced sample, colored by class.
plt.figure(figsize=(7,3))
andrews_curves(x_tsne_pca_df, "is")
Out[44]:
In [45]:
#df = model.fit_transform(x_tsne_pca)
#df1 = model.fit_transform(df)
#df2 = model.fit_transform(df1)
#df3 = model.fit_transform(df2)
In [46]:
#pd.DataFrame(df).to_pickle("dataset/tsne_df.pkl")
#pd.DataFrame(df1).to_pickle("dataset/tsne_df1.pkl")
#pd.DataFrame(df2).to_pickle("dataset/tsne_df2.pkl")
#pd.DataFrame(df3).to_pickle("dataset/tsne_df3.pkl")
In [47]:
# Load the four cached t-SNE embeddings (presumably produced by the commented
# re-fit chain above — each fit applied to the previous embedding).
df = pd.read_pickle("dataset/tsne_df.pkl").values
df1 = pd.read_pickle("dataset/tsne_df1.pkl").values
df2 = pd.read_pickle("dataset/tsne_df2.pkl").values
df3 = pd.read_pickle("dataset/tsne_df3.pkl").values
In [48]:
#plt.figure(figsize=(15,8))
# 2x2 panel of the t-SNE embeddings, one subplot per cached stage.
# Refactor: the four copy-pasted scatter blocks collapse into one loop.
f, axes = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10,5))
embeddings = [(df, "After 1000 epochs"), (df1, "After 2000 epochs"),
              (df2, "After 3000 epochs"), (df3, "After 4000 epochs")]
for ax, (emb, title) in zip(axes.ravel(), embeddings):
    # Plot Normal and Attack points separately so each gets its own color/label.
    ax.scatter(x=emb[y_tsne==0,0], y=emb[y_tsne==0,1], label='Normal')
    ax.scatter(x=emb[y_tsne==1,0], y=emb[y_tsne==1,1], label='Attack')
    ax.title.set_text(title)
plt.subplots_adjust(wspace=0.05, hspace=0.18)
# Legend on the first panel only (the panels share the same two classes).
axes[0, 0].legend(loc=0)
Out[48]:
In [49]:
# Larger standalone view of the final embedding.
plt.figure(figsize=(15,8))
plt.scatter(x = df3[y_tsne==0,0], y = df3[y_tsne==0,1], label = 'Normal')
plt.scatter(x = df3[y_tsne==1,0], y = df3[y_tsne==1,1], label = 'Attack')
# BUG FIX: scatter labels were passed but never rendered without a legend.
plt.legend(loc=0)
plt.title("After 4000 epochs")
Out[49]:
In [50]:
# Cache all training-side preprocessed frames and targets as pickles.
preprocessing.kdd_train_2labels.to_pickle("dataset/kdd_train_2labels.pkl")
preprocessing.kdd_train_2labels_y.to_pickle("dataset/kdd_train_2labels_y.pkl")
preprocessing.kdd_train_5labels.to_pickle("dataset/kdd_train_5labels.pkl")
preprocessing.kdd_train_5labels_y.to_pickle("dataset/kdd_train_5labels_y.pkl")
preprocessing.kdd_train__2labels.to_pickle("dataset/kdd_train__2labels.pkl")
preprocessing.kdd_train__2labels_y.to_pickle("dataset/kdd_train__2labels_y.pkl")
preprocessing.kdd_train__5labels.to_pickle("dataset/kdd_train__5labels.pkl")
preprocessing.kdd_train__5labels_y.to_pickle("dataset/kdd_train__5labels_y.pkl")
In [51]:
# Cache test-side artifacts.
# NOTE(review): unlike the training cell above, only 3 of the 8 test
# frames/targets are saved (e.g. kdd_test_2labels and kdd_test_5labels are
# missing) — confirm downstream notebooks don't need the others.
preprocessing.kdd_test_5labels_y.to_pickle("dataset/kdd_test_5labels_y.pkl")
preprocessing.kdd_test__5labels.to_pickle("dataset/kdd_test__5labels.pkl")
preprocessing.kdd_test__5labels_y.to_pickle("dataset/kdd_test__5labels_y.pkl")
In [52]:
# Persist the held-aside difficulty-level series for later analysis.
dataset.kdd_diff_level_train.to_pickle("dataset/kdd_diff_level_train.pkl")
dataset.kdd_diff_level_test.to_pickle("dataset/kdd_diff_level_test.pkl")
In [ ]: