In [2]:
# Here are some imports that are used along this notebook
import math
import itertools
import pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from time import time
import tensorflow as tf
from collections import OrderedDict
import keras
from sklearn.model_selection import KFold
from keras.utils import np_utils
%matplotlib inline
gt0 = time()


/home/hackerx/anaconda3/envs/tensorflow/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

In [3]:
train20_nsl_kdd_dataset_path = "NSL_KDD_Dataset/KDDTrain+_20Percent.txt"
train_nsl_kdd_dataset_path = "NSL_KDD_Dataset/KDDTrain+.txt"
test_nsl_kdd_dataset_path = "NSL_KDD_Dataset/KDDTest+.txt"

In [4]:
col_names = np.array(["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","labels"])

In [5]:
nominal_inx = [1, 2, 3]
binary_inx = [6, 11, 13, 14, 20, 21]
numeric_inx = list(set(range(41)).difference(nominal_inx).difference(binary_inx))

In [6]:
nominal_cols = col_names[nominal_inx].tolist()
binary_cols = col_names[binary_inx].tolist()
numeric_cols = col_names[numeric_inx].tolist()

In [ ]:


In [7]:
# Dictionary that contains mapping of various attacks to the four main categories
attack_dict_five_class = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}

attack_two_class = []
for key in attack_dict_five_class.keys():
    if key == 'normal':
        pass
    else:
        attack_two_class.append(key)

In [8]:
#Load test and train data
train_df = pd.read_csv(train_nsl_kdd_dataset_path, names = col_names)
test_df  = pd.read_csv(test_nsl_kdd_dataset_path , names = col_names)
train_labels = train_df.pop('dst_host_srv_rerror_rate')
test_labels = test_df.pop('dst_host_srv_rerror_rate')
total_dataset = pd.concat([train_df, test_df])
print(total_dataset.shape)


(148517, 41)

In [9]:
# check for null values
total_dataset[total_dataset.isnull().any(axis=1)]


Out[9]:
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ... dst_host_count dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate labels

0 rows × 41 columns


In [10]:
test_df.head()


Out[10]:
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ... dst_host_count dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate labels
0 tcp private REJ 0 0 0 0 0 0 0 ... 10 0.04 0.06 0.00 0.00 0.0 0.0 1.00 1.00 21
0 tcp private REJ 0 0 0 0 0 0 0 ... 1 0.00 0.06 0.00 0.00 0.0 0.0 1.00 1.00 21
2 tcp ftp_data SF 12983 0 0 0 0 0 0 ... 86 0.61 0.04 0.61 0.02 0.0 0.0 0.00 0.00 21
0 icmp eco_i SF 20 0 0 0 0 0 0 ... 57 1.00 0.00 1.00 0.28 0.0 0.0 0.00 0.00 15
1 tcp telnet RSTO 0 15 0 0 0 0 0 ... 86 0.31 0.17 0.03 0.02 0.0 0.0 0.83 0.71 11

5 rows × 41 columns


In [ ]:


In [11]:
total_dataset = pd.get_dummies(total_dataset)
print(total_dataset.shape)
train_df = total_dataset.iloc[0:125973, :]
test_df = total_dataset.iloc[125973:, :]
print(test_df.shape, test_df.shape)


(148517, 122)
(22544, 122) (22544, 122)

In [ ]:


In [12]:
train_labels_for_two_class = pd.DataFrame(train_labels.as_matrix(), columns=["class"])
test_labels_for_two_class = pd.DataFrame(test_labels.as_matrix(), columns=["class"])

In [13]:
train_labels_for_two_class.loc[train_labels_for_two_class['class'].isin(attack_two_class) , 'class'] = 1
train_labels_for_two_class.loc[train_labels_for_two_class['class'] == "normal" , 'class'] = 0
#for test labels 
test_labels_for_two_class.loc[test_labels_for_two_class['class'].isin(attack_two_class) , 'class'] = 1
test_labels_for_two_class.loc[test_labels_for_two_class['class'] == "normal" , 'class'] = 0

In [14]:
train_labels_for_two_class = np_utils.to_categorical(train_labels_for_two_class)
test_labels_for_two_class = np_utils.to_categorical(test_labels_for_two_class)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [15]:
#min-max mazimazation for the data
from sklearn.preprocessing import MinMaxScaler
train_X = train_df.as_matrix()
train_Y = train_labels_for_two_class

test_X = test_df.as_matrix()
test_Y = test_labels_for_two_class
scaler = MinMaxScaler() 
scaler.fit_transform(train_X)
scaler.fit_transform(train_Y)
scaler.fit_transform(test_X)
scaler.fit_transform(test_Y)
# train_X = train_X[0]
# train_Y = train_Y[0]
# test_X = test_X[0]
# test_Y = test_Y[0]


Out[15]:
array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [16]:
#Training our auto encoder

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

In [17]:
input_dim = train_X.shape[1]
input_layer = Input(shape=(input_dim, ))
encoder = Dense(30, activation="sigmoid", 
                activity_regularizer=regularizers.l1(10e-5))(input_layer)
decoder = Dense(input_dim, activation='relu')(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

In [18]:
nb_epoch = 10
batch_size = 128
autoencoder.compile(optimizer='adam', 
                    loss='mean_squared_error', 
                    metrics=['accuracy'])
checkpointer = ModelCheckpoint(filepath="model.h5",
                               verbose=0,
                               save_best_only=True)

tensorboard = TensorBoard(log_dir='./logs',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

history = autoencoder.fit(train_X,train_X ,
                    epochs=nb_epoch,
                    verbose = 1,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(test_X, test_X),
                    callbacks=[checkpointer, tensorboard]).history


Train on 125973 samples, validate on 22544 samples
Epoch 1/10
125973/125973 [==============================] - 2s 20us/step - loss: 415028354471.2015 - acc: 0.0762 - val_loss: 1836716560.7739 - val_acc: 0.0275
Epoch 2/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028346950.1439 - acc: 0.2619 - val_loss: 1836716025.9892 - val_acc: 0.4079
Epoch 3/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028353050.9392 - acc: 0.4047 - val_loss: 1836715342.2866 - val_acc: 0.4066
Epoch 4/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028344980.8914 - acc: 0.4043 - val_loss: 1836714851.3620 - val_acc: 0.4066
Epoch 5/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028343025.6653 - acc: 0.4043 - val_loss: 1836714409.3090 - val_acc: 0.4066
Epoch 6/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028344750.8376 - acc: 0.4043 - val_loss: 1836713844.2689 - val_acc: 0.4066
Epoch 7/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028341241.9020 - acc: 0.4043 - val_loss: 1836713278.4134 - val_acc: 0.4066
Epoch 8/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028339023.6426 - acc: 0.4043 - val_loss: 1836712947.9640 - val_acc: 0.4066
Epoch 9/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028340158.6280 - acc: 0.4140 - val_loss: 1836712356.3883 - val_acc: 0.4134
Epoch 10/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028334190.5493 - acc: 0.4360 - val_loss: 1836711982.9623 - val_acc: 0.4219

In [19]:
encoder = Model(input_layer, encoder)

In [20]:
out2 = Dense(2, activation='softmax')(encoder.output)
newmodel = Model(encoder.input,out2)

In [21]:
test_X.shape


Out[21]:
(22544, 122)

In [ ]:
from keras import optimizers
opt = optimizers.SGD(lr=0.00001)
newmodel.compile(loss='categorical_crossentropy',
          optimizer=opt, 
          metrics=['accuracy']) 

newmodel.fit(train_X, train_Y,
      epochs=10,
      batch_size=128,
      shuffle=True,
      validation_data=(test_X, test_Y))

In [23]:
scores = newmodel.evaluate(test_X, test_Y, verbose=1, steps=50) 
print("Accuracy: ", scores[1])


50/50 [==============================] - 1s 11ms/step
Accuracy:  0.7330996990203857

In [24]:
newmodel.evaluate??

In [ ]:


In [ ]:


In [ ]:


In [ ]: