In [3]:
    
import pandas as pd
import numpy as np
pd.set_option("display.max_rows",15)
%matplotlib inline
    
In [4]:
    
class dataset:
    # NSL-KDD column layout: 41 features, then the attack label and a
    # per-record difficulty score appended by the NSL-KDD distribution.
    col_names = ["duration","protocol_type","service","flag","src_bytes",
        "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
        "logged_in","num_compromised","root_shell","su_attempted","num_root",
        "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
        "is_host_login","is_guest_login","count","srv_count","serror_rate",
        "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
        "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
        "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
        "dst_host_rerror_rate","dst_host_srv_rerror_rate","label", "difficulty_level"]

    # Four raw splits: full train/test, the 20%-of-train subset, and the
    # harder "Test-21" subset. The .txt files ship without a header row.
    kdd_train = pd.read_csv("dataset/KDDTrain+.txt", names=col_names)
    kdd_test = pd.read_csv("dataset/KDDTest+.txt", names=col_names)

    kdd_train_ = pd.read_csv("dataset/KDDTrain+_20Percent.txt", names=col_names)
    kdd_test_ = pd.read_csv("dataset/KDDTest-21.txt", names=col_names)

    # Preserve the difficulty scores of the two full splits before that
    # column is removed from every frame.
    kdd_diff_level_train = kdd_train["difficulty_level"].copy()
    kdd_diff_level_test = kdd_test["difficulty_level"].copy()

    kdd_train = kdd_train.drop(columns="difficulty_level")
    kdd_test = kdd_test.drop(columns="difficulty_level")

    kdd_train_ = kdd_train_.drop(columns="difficulty_level")
    kdd_test_ = kdd_test_.drop(columns="difficulty_level")

    # Write CSV copies of the cleaned splits (index column included,
    # matching the original export behaviour).
    kdd_train.to_csv("dataset/KDDTrain+.csv")
    kdd_test.to_csv("dataset/KDDTest+.csv")

    kdd_train_.to_csv("dataset/KDDTrain_.csv")
    kdd_test_.to_csv("dataset/KDDTest_.csv")
    
In [5]:
    
# Convert the three nominal features to pandas Categoricals. The test / 20%-train /
# Test-21 frames are encoded with the *training* set's category list so that the
# one-hot columns produced later line up across all four frames (values unseen in
# training become NaN).
category_variables = ["protocol_type","service","flag"]
for cv in category_variables:
    dataset.kdd_train[cv] = dataset.kdd_train[cv].astype("category")

    # FIX: .astype("category", categories=...) was deprecated in pandas 0.21 and
    # removed in 0.25; a CategoricalDtype carries the fixed category list instead.
    train_dtype = pd.CategoricalDtype(categories=dataset.kdd_train[cv].cat.categories)
    dataset.kdd_test[cv] = dataset.kdd_test[cv].astype(train_dtype)
    dataset.kdd_train_[cv] = dataset.kdd_train_[cv].astype(train_dtype)
    dataset.kdd_test_[cv] = dataset.kdd_test_[cv].astype(train_dtype)

    print("Length of Categories for {} are {}".format(cv , len(dataset.kdd_train[cv].cat.categories)))
    print("Categories for {} are {} \n".format(cv ,dataset.kdd_train[cv].cat.categories))
    
    
    
In [6]:
    
# Rich display of the full training frame (truncated to 15 rows by the
# display.max_rows option set at the top of the notebook).
dataset.kdd_train
    
    Out[6]:
In [7]:
    
# Same preview for the full test frame.
dataset.kdd_test
    
    Out[7]:
In [8]:
    
# Numeric summary statistics of the training features.
dataset.kdd_train.describe()
    
    Out[8]:
In [9]:
    
# Sparsity check: fraction of all cells in the training frame that are
# exactly zero (boolean mask summed over both axes / total cell count).
a = dataset.kdd_train.isin([0])
a.sum().sum() / a.size
    
    Out[9]:
In [10]:
    
# Numeric summary statistics of the test features.
dataset.kdd_test.describe()
    
    Out[10]:
In [11]:
    
# Profile the target column: its distinct attack labels and the categorical
# summary statistics (count / unique / top / freq).
print("Column - Label")
# FIX: the original printed the entire column under the heading
# "Unique values"; print the distinct labels instead, as the heading claims.
print("Unique values: \n{}".format(dataset.kdd_train.label.unique()))
print("\nStatistical properties: \n{}".format(dataset.kdd_train.label.describe()))
    
    
In [12]:
    
# Mapping from each raw NSL-KDD label to one of four attack families
# (DoS, Probe, R2L, U2R) or 'normal'. Labels absent from this dict would
# raise KeyError when mapped below, which surfaces unexpected labels early.
# NOTE(review): some NSL-KDD taxonomies file 'httptunnel' under R2L rather
# than U2R — confirm the intended grouping.
attack_types = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',
    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}
# Collapse the four attack families into a binary Attack/Normal flag.
is_attack = {
    "DoS":"Attack",
    "R2L":"Attack",
    "U2R":"Attack",
    "Probe":"Attack",
    "normal":"Normal"
}
    
In [13]:
    
# Derive the two coarser targets on every split: a 5-class attack family
# column ("type") and a binary Attack/Normal column ("is").
# dict.__getitem__ keeps the original lambda's behaviour: an unknown label
# raises KeyError instead of silently becoming NaN.
for frame in (dataset.kdd_train, dataset.kdd_test, dataset.kdd_train_, dataset.kdd_test_):
    frame["type"] = frame["label"].map(attack_types.__getitem__)
    frame["is"] = frame["type"].map(is_attack.__getitem__)
    
In [14]:
    
# Sparsity of the Normal rows: zero-valued cell count, total cell count,
# and their ratio (the bare trailing expression is the cell's displayed value).
a = dataset.kdd_train.set_index("is")
normal_slice = a.loc["Normal"]
normal_zeros = normal_slice.isin([0]).sum().sum()
print(normal_zeros)
print(normal_slice.size)
normal_zeros / normal_slice.size
    
    
    Out[14]:
In [15]:
    
# Same sparsity check for the Attack rows of the training set.
a = dataset.kdd_train.set_index("is")
attack_slice = a.loc["Attack"]
attack_zeros = attack_slice.isin([0]).sum().sum()
print(attack_zeros)
print(attack_slice.size)
attack_zeros / attack_slice.size
    
    
    Out[15]:
In [16]:
    
1804888 / (1804888 + 1538253)
    
    Out[16]:
In [17]:
    
# Group the training set by the 5-class family and by the binary flag.
kdd_attack_type_group = dataset.kdd_train.groupby("type")
kdd_is_attack_group = dataset.kdd_train.groupby("is")
    
In [18]:
    
# Row count per attack family.
kdd_attack_type_group.type.count()
    
    Out[18]:
In [19]:
    
# Row count per Attack/Normal class.
kdd_is_attack_group["is"].count()
    
    Out[19]:
In [20]:
    
# Bare GroupBy object — displays only its repr, not data.
kdd_attack_type_group
    
    Out[20]:
In [21]:
    
# Raw labels that occur among Attack rows.
df = dataset.kdd_train.set_index("is")
df.loc["Attack"].label.unique()
    
    Out[21]:
In [22]:
    
# Raw labels that occur among Normal rows (expected: just 'normal').
df.loc["Normal"].label.unique()
    
    Out[22]:
In [23]:
    
#kdd_is_attack_group.hist(figsize=[25,22])
    
In [24]:
    
#kdd_attack_type_group.hist(figsize=[25,22])
    
In [25]:
    
# Percentage share of each difficulty level in the full training split
# (grouping a Series by itself counts occurrences per distinct value).
diff_train = dataset.kdd_diff_level_train
gb = diff_train.groupby(diff_train)
(gb.count() / diff_train.count())*100
    
    Out[25]:
In [26]:
    
# Same percentage breakdown of difficulty levels for the full test split.
diff_test = dataset.kdd_diff_level_test
gb = diff_test.groupby(diff_test)
(gb.count() / diff_test.count())*100
    
    Out[26]:
In [ ]:
    
    
In [27]:
    
# Columns to one-hot encode: the three nominal features plus the target
# column itself ("is" for the binary task, "type" for the 5-class task).
dummy_variables_2labels = [*category_variables, "is"]
dummy_variables_5labels = [*category_variables, "type"]
# Integer codes for the SVM-style y vectors built below.
attack_codes_2labels = {"Attack":1, "Normal":0}
attack_codes_5labels = {'DoS':1, 'normal':0, 'Probe':2, 'R2L':3, 'U2R':4}
    
# Namespace class holding the encoded X frames and integer y Series for all
# four splits and both label granularities. Naming convention: a trailing
# underscore before the label suffix (e.g. kdd_train__2labels) marks the
# 20%-train / Test-21 subsets. Because the categorical dtypes were aligned
# to the training categories earlier, get_dummies yields the same one-hot
# columns for every frame.
class preprocessing:
    # One-hot encoded feature frames; the target is temporarily one-hot
    # encoded too and the unused label columns are dropped further down.
    kdd_train_2labels = pd.get_dummies(dataset.kdd_train, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_train_5labels = pd.get_dummies(dataset.kdd_train, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)
    kdd_test_2labels = pd.get_dummies(dataset.kdd_test, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_test_5labels = pd.get_dummies(dataset.kdd_test, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)
    kdd_train__2labels = pd.get_dummies(dataset.kdd_train_, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_train__5labels = pd.get_dummies(dataset.kdd_train_, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)
        
    kdd_test__2labels = pd.get_dummies(dataset.kdd_test_, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_test__5labels = pd.get_dummies(dataset.kdd_test_, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)
    # String targets copied before encoding; mapped to integer codes below.
    kdd_train_2labels_y = dataset.kdd_train["is"].copy() # For SVM
    kdd_train_5labels_y = dataset.kdd_train["type"].copy() # For SVM
    kdd_test_2labels_y = dataset.kdd_test["is"].copy() # For SVM
    kdd_test_5labels_y = dataset.kdd_test["type"].copy() # For SVM
    
    kdd_train__2labels_y = dataset.kdd_train_["is"].copy() # For SVM
    kdd_train__5labels_y = dataset.kdd_train_["type"].copy() # For SVM
    
    kdd_test__2labels_y = dataset.kdd_test_["is"].copy() # For SVM
    kdd_test__5labels_y = dataset.kdd_test_["type"].copy() # For SVM
    # Drop the raw label and the *other* task's target from each X frame
    # (2-label frames keep only the is_* one-hot columns, and vice versa).
    kdd_train_2labels.drop(["label", "type"], axis=1, inplace=True)
    kdd_test_2labels.drop(["label", "type"], axis=1, inplace=True)
    
    kdd_train__2labels.drop(["label", "type"], axis=1, inplace=True)
    kdd_test__2labels.drop(["label", "type"], axis=1, inplace=True)
    kdd_train_5labels.drop(["label", "is"], axis=1, inplace=True)
    kdd_test_5labels.drop(["label", "is"], axis=1, inplace=True)
    
    kdd_train__5labels.drop(["label", "is"], axis=1, inplace=True)
    kdd_test__5labels.drop(["label", "is"], axis=1, inplace=True)
    
    # Replace the string targets with integer codes; a lambda lookup (rather
    # than mapping the dict directly) raises KeyError on unexpected values.
    kdd_train_2labels_y = kdd_train_2labels_y.map(lambda x: attack_codes_2labels[x])
    kdd_test_2labels_y = kdd_test_2labels_y.map(lambda x: attack_codes_2labels[x])
    
    kdd_train__2labels_y = kdd_train__2labels_y.map(lambda x: attack_codes_2labels[x])
    kdd_test__2labels_y = kdd_test__2labels_y.map(lambda x: attack_codes_2labels[x])
    
    kdd_train_5labels_y = kdd_train_5labels_y.map(lambda x: attack_codes_5labels[x])
    kdd_test_5labels_y = kdd_test_5labels_y.map(lambda x: attack_codes_5labels[x])
    
    kdd_train__5labels_y = kdd_train__5labels_y.map(lambda x: attack_codes_5labels[x])
    kdd_test__5labels_y = kdd_test__5labels_y.map(lambda x: attack_codes_5labels[x])
    
In [ ]:
    
    
In [28]:
    
# Persist the encoded column lists so downstream notebooks can rebuild
# frames with identical column order.
preprocessing.kdd_train_2labels.columns.to_series().to_csv("dataset/columns_2labels.csv")
preprocessing.kdd_train_5labels.columns.to_series().to_csv("dataset/columns_5labels.csv")
    
In [29]:
    
# Spot-check the encoded column names.
preprocessing.kdd_train_2labels.columns
    
    Out[29]:
In [30]:
    
# Shape sanity checks: train/test X frames of the same task should agree on
# the column count, and each y Series should match its X frame's row count.
preprocessing.kdd_train_2labels.shape
    
    Out[30]:
In [31]:
    
preprocessing.kdd_train_5labels.shape
    
    Out[31]:
In [32]:
    
preprocessing.kdd_test_2labels.shape
    
    Out[32]:
In [33]:
    
preprocessing.kdd_test_5labels.shape
    
    Out[33]:
In [34]:
    
preprocessing.kdd_train_2labels_y.shape
    
    Out[34]:
In [35]:
    
preprocessing.kdd_test_2labels_y.shape
    
    Out[35]:
In [36]:
    
preprocessing.kdd_train_5labels_y.shape
    
    Out[36]:
In [37]:
    
preprocessing.kdd_test_5labels_y.shape
    
    Out[37]:
In [38]:
    
import matplotlib
from pandas.plotting import andrews_curves
from pandas.plotting import parallel_coordinates
from sklearn import preprocessing as ps
from pandas.plotting import radviz
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
    
In [39]:
    
# Build the t-SNE input frames: drop the one-hot target columns from X and
# re-attach the integer 0/1 target as the final column (sliced off again in
# the sampling cells below via iloc[:, :-1] / iloc[:, -1]).
df_train = preprocessing.kdd_train_2labels.drop(["is_Attack", "is_Normal"], axis = 1)
df_test = preprocessing.kdd_test_2labels.drop(["is_Attack", "is_Normal"], axis = 1)
df_train = pd.concat([df_train, preprocessing.kdd_train_2labels_y], axis = 1)
df_test = pd.concat([df_test, preprocessing.kdd_test_2labels_y], axis = 1)
    
In [40]:
    
# 2-D t-SNE model with a fixed seed; the embedding itself is loaded from
# pickle caches further below rather than recomputed here.
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
#np.set_printoptions(suppress=True)
# The commented lines generated the cached 10% sample once; the notebook now
# reloads it so the expensive sampling/t-SNE runs are skipped on re-run.
#sample = df_train.sample(int(df_train.shape[0]*.1)) # 10% of total data
#sample.to_pickle("dataset/tsne_sample.pkl")
# NOTE(review): pickle.load runs arbitrary code — only load trusted caches.
sample = pd.read_pickle("dataset/tsne_sample.pkl")
    
In [ ]:
    
    
In [41]:
    
# Split the cached sample back into features (all but last column) and the
# 0/1 target (last column, appended in the cell that built df_train).
x_tsne = sample.iloc[:, :-1]
y_tsne = sample.iloc[:, -1]
# SparsePCA down to 40 components; the transform itself was run once and
# cached (see commented line), so only the estimator is constructed here.
from sklearn.decomposition import SparsePCA
pca_analysis = SparsePCA(n_components=40)
#x_tsne_pca = pca_analysis.fit_transform(x_tsne)
    
In [42]:
    
# Reload the cached 40-component projection as a plain ndarray.
#pd.DataFrame(x_tsne_pca).to_pickle("dataset/tsne_pca_df.pkl")
x_tsne_pca = pd.read_pickle("dataset/tsne_pca_df.pkl").values
    
In [43]:
    
# Rebuild a labelled frame for the Andrews-curves plot: map the integer
# codes back to "Attack"/"Normal" strings for the class column.
x_tsne_pca_df = pd.DataFrame(x_tsne_pca)
codes_to_attack = {1:"Attack", 0:"Normal"}
y_tsne_cta = y_tsne.map(lambda x: codes_to_attack[x])
x_tsne_pca_df['is'] = y_tsne_cta.values
    
In [44]:
    
# Andrews curves of the 40-component projection, coloured by class.
plt.figure(figsize=(7,3))
andrews_curves(x_tsne_pca_df, "is")
    
    Out[44]:
    
In [45]:
    
#df = model.fit_transform(x_tsne_pca) 
#df1 = model.fit_transform(df)
#df2 = model.fit_transform(df1) 
#df3 = model.fit_transform(df2)
    
In [46]:
    
#pd.DataFrame(df).to_pickle("dataset/tsne_df.pkl")
#pd.DataFrame(df1).to_pickle("dataset/tsne_df1.pkl")
#pd.DataFrame(df2).to_pickle("dataset/tsne_df2.pkl")
#pd.DataFrame(df3).to_pickle("dataset/tsne_df3.pkl")
    
In [47]:
    
# Load the four cached t-SNE embeddings (produced by the commented-out cells
# above, each refining the previous embedding) as plain ndarrays.
# NOTE(review): only unpickle trusted local caches.
df = pd.read_pickle("dataset/tsne_df.pkl").values
df1 = pd.read_pickle("dataset/tsne_df1.pkl").values
df2 = pd.read_pickle("dataset/tsne_df2.pkl").values
df3 = pd.read_pickle("dataset/tsne_df3.pkl").values
    
In [48]:
    
# 2x2 grid of the four cached embeddings (1000/2000/3000/4000 refinement
# epochs), Normal vs Attack points in each panel. The four copy-pasted
# plotting stanzas are collapsed into one loop; axes keep their ax1..ax4
# names and ax1.legend(loc=0) stays the cell's displayed value.
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10,5))
panels = [(ax1, df, "After 1000 epochs"),
          (ax2, df1, "After 2000 epochs"),
          (ax3, df2, "After 3000 epochs"),
          (ax4, df3, "After 4000 epochs")]
for ax, points, title in panels:
    ax.scatter(x = points[y_tsne==0,0], y = points[y_tsne==0,1], label = 'Normal')
    ax.scatter(x = points[y_tsne==1,0], y = points[y_tsne==1,1], label = 'Attack')
    ax.title.set_text(title)
plt.subplots_adjust(wspace=0.05, hspace=0.18)
ax1.legend(loc=0)
    
    Out[48]:
    
In [49]:
    
# Enlarged view of the final (4000-epoch) embedding.
# NOTE(review): labels are passed to scatter but no plt.legend() is called,
# so the legend never renders in this figure — confirm whether intended.
plt.figure(figsize=(15,8))
plt.scatter(x = df3[y_tsne==0,0], y = df3[y_tsne==0,1], label = 'Normal')
plt.scatter(x = df3[y_tsne==1,0], y = df3[y_tsne==1,1], label = 'Attack')
plt.title("After 4000 epochs")
    
    Out[49]:
    
In [50]:
    
# Persist every training X/y pair (both tasks, full and 20% splits) for
# downstream model notebooks.
preprocessing.kdd_train_2labels.to_pickle("dataset/kdd_train_2labels.pkl")
preprocessing.kdd_train_2labels_y.to_pickle("dataset/kdd_train_2labels_y.pkl")
preprocessing.kdd_train_5labels.to_pickle("dataset/kdd_train_5labels.pkl")
preprocessing.kdd_train_5labels_y.to_pickle("dataset/kdd_train_5labels_y.pkl")
preprocessing.kdd_train__2labels.to_pickle("dataset/kdd_train__2labels.pkl")
preprocessing.kdd_train__2labels_y.to_pickle("dataset/kdd_train__2labels_y.pkl")
preprocessing.kdd_train__5labels.to_pickle("dataset/kdd_train__5labels.pkl")
preprocessing.kdd_train__5labels_y.to_pickle("dataset/kdd_train__5labels_y.pkl")
    
In [51]:
    
# Test-side exports. NOTE(review): only three test objects are pickled —
# kdd_test_2labels(_y), kdd_test_5labels and kdd_test__2labels(_y) are never
# written, unlike the train side above; confirm this asymmetry is intended.
preprocessing.kdd_test_5labels_y.to_pickle("dataset/kdd_test_5labels_y.pkl")
preprocessing.kdd_test__5labels.to_pickle("dataset/kdd_test__5labels.pkl")
preprocessing.kdd_test__5labels_y.to_pickle("dataset/kdd_test__5labels_y.pkl")
    
In [52]:
    
# Persist the difficulty scores saved before the column was dropped.
dataset.kdd_diff_level_train.to_pickle("dataset/kdd_diff_level_train.pkl")
dataset.kdd_diff_level_test.to_pickle("dataset/kdd_diff_level_test.pkl")
    
In [ ]: