notebook.community

Edit and run



In [2]:

    
# Here are some imports that are used along this notebook
import math
import itertools
import pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from time import time
import tensorflow as tf
from collections import OrderedDict
import keras
from sklearn.model_selection import KFold
from keras.utils import np_utils
%matplotlib inline
gt0 = time()









    



/home/hackerx/anaconda3/envs/tensorflow/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.



In [3]:

    
train20_nsl_kdd_dataset_path = "NSL_KDD_Dataset/KDDTrain+_20Percent.txt"
train_nsl_kdd_dataset_path = "NSL_KDD_Dataset/KDDTrain+.txt"
test_nsl_kdd_dataset_path = "NSL_KDD_Dataset/KDDTest+.txt"



In [4]:

    
col_names = np.array(["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","labels"])



In [5]:

    
nominal_inx = [1, 2, 3]
binary_inx = [6, 11, 13, 14, 20, 21]
numeric_inx = list(set(range(41)).difference(nominal_inx).difference(binary_inx))



In [6]:

    
nominal_cols = col_names[nominal_inx].tolist()
binary_cols = col_names[binary_inx].tolist()
numeric_cols = col_names[numeric_inx].tolist()



In [ ]:



In [7]:

    
# Dictionary that contains mapping of various attacks to the four main categories
attack_dict_five_class = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}

attack_two_class = []
for key in attack_dict_five_class.keys():
    if key == 'normal':
        pass
    else:
        attack_two_class.append(key)



In [8]:

    
#Load test and train data
train_df = pd.read_csv(train_nsl_kdd_dataset_path, names = col_names)
test_df  = pd.read_csv(test_nsl_kdd_dataset_path , names = col_names)
train_labels = train_df.pop('dst_host_srv_rerror_rate')
test_labels = test_df.pop('dst_host_srv_rerror_rate')
total_dataset = pd.concat([train_df, test_df])
print(total_dataset.shape)









    



(148517, 41)



In [9]:

    
# check for null values
total_dataset[total_dataset.isnull().any(axis=1)]









    Out[9]:







  
    
      
      duration
      protocol_type
      service
      flag
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      ...
      dst_host_count
      dst_host_srv_count
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      labels
    
  
  
  

0 rows × 41 columns



In [10]:

    
test_df.head()









    Out[10]:







  
    
      
      duration
      protocol_type
      service
      flag
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      ...
      dst_host_count
      dst_host_srv_count
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      labels
    
  
  
    
      0
      tcp
      private
      REJ
      0
      0
      0
      0
      0
      0
      0
      ...
      10
      0.04
      0.06
      0.00
      0.00
      0.0
      0.0
      1.00
      1.00
      21
    
    
      0
      tcp
      private
      REJ
      0
      0
      0
      0
      0
      0
      0
      ...
      1
      0.00
      0.06
      0.00
      0.00
      0.0
      0.0
      1.00
      1.00
      21
    
    
      2
      tcp
      ftp_data
      SF
      12983
      0
      0
      0
      0
      0
      0
      ...
      86
      0.61
      0.04
      0.61
      0.02
      0.0
      0.0
      0.00
      0.00
      21
    
    
      0
      icmp
      eco_i
      SF
      20
      0
      0
      0
      0
      0
      0
      ...
      57
      1.00
      0.00
      1.00
      0.28
      0.0
      0.0
      0.00
      0.00
      15
    
    
      1
      tcp
      telnet
      RSTO
      0
      15
      0
      0
      0
      0
      0
      ...
      86
      0.31
      0.17
      0.03
      0.02
      0.0
      0.0
      0.83
      0.71
      11
    
  

5 rows × 41 columns



In [ ]:



In [11]:

    
total_dataset = pd.get_dummies(total_dataset)
print(total_dataset.shape)
train_df = total_dataset.iloc[0:125973, :]
test_df = total_dataset.iloc[125973:, :]
print(test_df.shape, test_df.shape)









    



(148517, 122)
(22544, 122) (22544, 122)



In [ ]:



In [12]:

    
train_labels_for_two_class = pd.DataFrame(train_labels.as_matrix(), columns=["class"])
test_labels_for_two_class = pd.DataFrame(test_labels.as_matrix(), columns=["class"])



In [13]:

    
train_labels_for_two_class.loc[train_labels_for_two_class['class'].isin(attack_two_class) , 'class'] = 1
train_labels_for_two_class.loc[train_labels_for_two_class['class'] == "normal" , 'class'] = 0
#for test labels 
test_labels_for_two_class.loc[test_labels_for_two_class['class'].isin(attack_two_class) , 'class'] = 1
test_labels_for_two_class.loc[test_labels_for_two_class['class'] == "normal" , 'class'] = 0



In [14]:

    
train_labels_for_two_class = np_utils.to_categorical(train_labels_for_two_class)
test_labels_for_two_class = np_utils.to_categorical(test_labels_for_two_class)



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [15]:

    
#min-max mazimazation for the data
from sklearn.preprocessing import MinMaxScaler
train_X = train_df.as_matrix()
train_Y = train_labels_for_two_class

test_X = test_df.as_matrix()
test_Y = test_labels_for_two_class
scaler = MinMaxScaler() 
scaler.fit_transform(train_X)
scaler.fit_transform(train_Y)
scaler.fit_transform(test_X)
scaler.fit_transform(test_Y)
# train_X = train_X[0]
# train_Y = train_Y[0]
# test_X = test_X[0]
# test_Y = test_Y[0]









    Out[15]:





array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])



In [16]:

    
#Training our auto encoder

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers



In [17]:

    
input_dim = train_X.shape[1]
input_layer = Input(shape=(input_dim, ))
encoder = Dense(30, activation="sigmoid", 
                activity_regularizer=regularizers.l1(10e-5))(input_layer)
decoder = Dense(input_dim, activation='relu')(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)



In [18]:

    
nb_epoch = 10
batch_size = 128
autoencoder.compile(optimizer='adam', 
                    loss='mean_squared_error', 
                    metrics=['accuracy'])
checkpointer = ModelCheckpoint(filepath="model.h5",
                               verbose=0,
                               save_best_only=True)

tensorboard = TensorBoard(log_dir='./logs',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

history = autoencoder.fit(train_X,train_X ,
                    epochs=nb_epoch,
                    verbose = 1,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(test_X, test_X),
                    callbacks=[checkpointer, tensorboard]).history









    



Train on 125973 samples, validate on 22544 samples
Epoch 1/10
125973/125973 [==============================] - 2s 20us/step - loss: 415028354471.2015 - acc: 0.0762 - val_loss: 1836716560.7739 - val_acc: 0.0275
Epoch 2/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028346950.1439 - acc: 0.2619 - val_loss: 1836716025.9892 - val_acc: 0.4079
Epoch 3/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028353050.9392 - acc: 0.4047 - val_loss: 1836715342.2866 - val_acc: 0.4066
Epoch 4/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028344980.8914 - acc: 0.4043 - val_loss: 1836714851.3620 - val_acc: 0.4066
Epoch 5/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028343025.6653 - acc: 0.4043 - val_loss: 1836714409.3090 - val_acc: 0.4066
Epoch 6/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028344750.8376 - acc: 0.4043 - val_loss: 1836713844.2689 - val_acc: 0.4066
Epoch 7/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028341241.9020 - acc: 0.4043 - val_loss: 1836713278.4134 - val_acc: 0.4066
Epoch 8/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028339023.6426 - acc: 0.4043 - val_loss: 1836712947.9640 - val_acc: 0.4066
Epoch 9/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028340158.6280 - acc: 0.4140 - val_loss: 1836712356.3883 - val_acc: 0.4134
Epoch 10/10
125973/125973 [==============================] - 2s 18us/step - loss: 415028334190.5493 - acc: 0.4360 - val_loss: 1836711982.9623 - val_acc: 0.4219



In [19]:

    
encoder = Model(input_layer, encoder)



In [20]:

    
out2 = Dense(2, activation='softmax')(encoder.output)
newmodel = Model(encoder.input,out2)



In [21]:

    
test_X.shape









    Out[21]:





(22544, 122)



In [ ]:

    
from keras import optimizers
opt = optimizers.SGD(lr=0.00001)
newmodel.compile(loss='categorical_crossentropy',
          optimizer=opt, 
          metrics=['accuracy']) 

newmodel.fit(train_X, train_Y,
      epochs=10,
      batch_size=128,
      shuffle=True,
      validation_data=(test_X, test_Y))



In [23]:

    
scores = newmodel.evaluate(test_X, test_Y, verbose=1, steps=50) 
print("Accuracy: ", scores[1])









    



50/50 [==============================] - 1s 11ms/step
Accuracy:  0.7330996990203857



In [24]:

    
newmodel.evaluate??



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	duration	protocol_type	service	flag	src_bytes	...	dst_host_count	dst_host_srv_count	dst_host_same_srv_rate	dst_host_diff_srv_rate	dst_host_same_src_port_rate	dst_host_srv_serror_rate	dst_host_rerror_rate	labels
0	tcp	private	REJ	0	0	...	10	0.04	0.06	0.00	0.00	1.00	1.00	21
0	tcp	private	REJ	0	0	...	1	0.00	0.06	0.00	0.00	1.00	1.00	21
2	tcp	ftp_data	SF	12983	0	...	86	0.61	0.04	0.61	0.02	0.00	0.00	21
0	icmp	eco_i	SF	20	0	...	57	1.00	0.00	1.00	0.28	0.00	0.00	15
1	tcp	telnet	RSTO	0	15	...	86	0.31	0.17	0.03	0.02	0.83	0.71	11