Read the NSL-KDD Data Sample


In [1]:
import pandas as pd
import numpy as np
import os
import time
from collections import namedtuple
pd.set_option("display.max_rows",35)
%matplotlib inline

In [2]:
class dataset:
    # Namespace for the pre-pickled NSL-KDD splits; the reads execute once,
    # at class-definition time (no instances are ever created).
    # NOTE(review): unpickling runs arbitrary code — only load trusted files.
    kdd_train_2labels = pd.read_pickle("dataset/kdd_train_2labels_20percent.pkl")  # binary (attack/normal) labels
    kdd_test_2labels = pd.read_pickle("dataset/kdd_test_2labels_20percent.pkl")
    
    # 5-class label variants — loaded but not used anywhere below.
    kdd_train_5labels = pd.read_pickle("dataset/kdd_train_5labels.pkl")
    kdd_test_5labels = pd.read_pickle("dataset/kdd_test_5labels.pkl")

In [3]:
dataset.kdd_train_2labels.shape  # (rows, cols) of the binary-label training set


Out[3]:
(25192, 120)

In [4]:
dataset.kdd_test_2labels.shape  # (rows, cols) of the binary-label test set


Out[4]:
(11850, 120)

In [5]:
from sklearn import model_selection as ms
from sklearn import preprocessing as pp

class preprocess:
    # Splits the frames into features (X) / one-hot labels (y) and
    # standardizes the features. Runs once at class-definition time;
    # downstream cells use x_train, x_test, y_train, y_test.
    
    output_columns_2labels = ['is_Attack','is_Normal']  # one-hot label columns, in this order
    
    # Feature matrices: every column except the two label columns.
    x_input = dataset.kdd_train_2labels.drop(output_columns_2labels, axis = 1)
    y_output = dataset.kdd_train_2labels.loc[:,output_columns_2labels]

    x_test_input = dataset.kdd_test_2labels.drop(output_columns_2labels, axis = 1)
    y_test = dataset.kdd_test_2labels.loc[:,output_columns_2labels]

    # Fit the scaler on the training features only, then reuse it on the
    # test features (avoids leaking test statistics into the transform).
    ss = pp.StandardScaler()

    x_train = ss.fit_transform(x_input)
    x_test = ss.transform(x_test_input)

    # Plain numpy arrays for feeding into TensorFlow placeholders.
    y_train = y_output.values
    y_test = y_test.values  # rebinds y_test from DataFrame to ndarray

preprocess.x_train.std()  # overall std ≈ 1.0 confirms standardization worked


Out[5]:
0.99148920692946862

In [6]:
import tensorflow as tf
from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import basic_rnn_seq2seq
from tensorflow.contrib.rnn import RNNCell, LSTMCell, MultiRNNCell

In [7]:
class network(object):
    # LSTM classifier built with the deprecated tf.contrib legacy seq2seq
    # API (TensorFlow 1.x only). Class attributes are defaults; __init__
    # overrides a subset per hyper-parameter run.
    
    input_dim = 118            # features fed to the placeholders (120 cols - 2 label cols)
    classes = 2
    hidden_encoder_dim = 118   # NOTE(review): assigned but never used in build_layers
    hidden_layers = 1
    latent_dim = 10

    hidden_decoder_dim = 118   # NOTE(review): assigned but never used in build_layers
    lam = 0.01                 # NOTE(review): assigned but never used (no regularizer applied)
    
    def __init__(self, classes, hidden_layers, num_of_features):
        """Store per-run hyper-parameters; the graph is built by build_layers()."""
        self.classes = classes
        self.hidden_layers = hidden_layers
        # NOTE(review): num_of_features lands in latent_dim, which build_layers
        # copies to a local but never actually uses — cells are sized by input_dim.
        self.latent_dim = num_of_features
            
    def build_layers(self):
        """Build placeholders, the seq2seq LSTM, the loss/accuracy ops, and a
        clipped-gradient Adam train op on a fresh default graph."""
        tf.reset_default_graph()  # clean graph each run (this is called once per grid point)
        #learning_rate = tf.Variable(initial_value=0.001)

        input_dim = self.input_dim
        classes = self.classes
        hidden_encoder_dim = self.hidden_encoder_dim
        hidden_layers = self.hidden_layers
        latent_dim = self.latent_dim
        hidden_decoder_dim = self.hidden_decoder_dim
        lam = self.lam
        
        with tf.variable_scope("Input"):
            # Inputs carry a single time step: [batch, 1, features] / [batch, 1, classes].
            self.x_input = tf.placeholder("float", shape=[None, 1, input_dim])
            self.y_input_ = tf.placeholder("float", shape=[None, 1, classes])
            self.keep_prob = tf.placeholder("float")  # NOTE(review): fed by Train but never used (no dropout in graph)
            self.lr = tf.placeholder("float")
            # Unstack the time axis -> length-1 lists of [batch, ...] tensors for seq2seq.
            self.x_list = tf.unstack(self.x_input, axis= 1)
            self.y_list_ = tf.unstack(self.y_input_, axis = 1)
            self.y_ = self.y_list_[0]  # ground-truth one-hot labels, [batch, classes]
            
            #GO = tf.fill((tf.shape(self.x)[0], 1), 0.5)
            
            #y_with_GO = tf.stack([self.y_, GO])
            
        with tf.variable_scope("lstm"):
            multi_cell = MultiRNNCell([LSTMCell(input_dim) for i in range(hidden_layers)] )
            
            # NOTE(review): y_list_ (the true labels) is passed as decoder input,
            # i.e. teacher forcing at both train and eval time.
            self.y, states = basic_rnn_seq2seq(self.x_list, self.y_list_, multi_cell)
            #self.y = tf.slice(self.y, [0, 0], [-1,2])
            
            #self.out = tf.squeeze(self.y)
            
            # Project the (single) decoder output to class probabilities.
            self.y = tf.layers.dense(self.y[0], classes, activation = tf.nn.softmax)
        with tf.variable_scope("Loss"):
            
            # MSE between one-hot targets and softmax outputs (not cross-entropy).
            self.regularized_loss = tf.losses.mean_squared_error(self.y_, self.y)
            correct_prediction = tf.equal(tf.argmax(self.y_, 1), tf.argmax(self.y, 1))
            self.tf_accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name = "Accuracy")

        with tf.variable_scope("Optimizer"):
            learning_rate=self.lr
            optimizer = tf.train.AdamOptimizer(learning_rate)
            # Element-wise clip of every gradient to [-1, 1] before applying.
            gradients, variables = zip(*optimizer.compute_gradients(self.regularized_loss))
            gradients = [
                None if gradient is None else tf.clip_by_value(gradient, -1, 1)
                for gradient in gradients]
            self.train_op = optimizer.apply_gradients(zip(gradients, variables))
            #self.train_op = optimizer.minimize(self.regularized_loss)
            
        # add op for merging summary
        #self.summary_op = tf.summary.merge_all()
        # Class-index predictions/targets for accuracy and confusion-matrix reporting.
        self.pred = tf.argmax(self.y, axis = 1)
        self.actual = tf.argmax(self.y_, axis = 1)

        # add Saver ops
        self.saver = tf.train.Saver()

# Smoke-test the batching scheme used later in Train.train: hold out a 10%
# validation split and partition the remaining row indices into
# `batch_iterations` roughly equal index batches.
#
# FIX: in the original source these statements were fused onto single lines
# ("... test_size=0.1) batch_indices = ..." and "for i in ...: print(...) print(...)"),
# which is invalid Python — reformatted one statement per line, same behavior.
batch_iterations = 200

x_train, x_valid, y_train, y_valid = ms.train_test_split(
    preprocess.x_train, preprocess.y_train, test_size=0.1)
batch_indices = np.array_split(np.arange(x_train.shape[0]), batch_iterations)

# np.newaxis inserts the length-1 time-step axis the network's
# [batch, 1, features] placeholder expects.
for i in batch_indices:
    print(x_train[i, np.newaxis, :])
    print(y_train[i, np.newaxis, :])


In [8]:
import collections

class Train:    
    # Grid-search training harness. All state lives on the class itself;
    # no instances are ever created.
    
    # One score record per hyper-parameter run.
    result = namedtuple("score", ['epoch', 'no_of_features','hidden_layers','train_score', 'test_score_20', 'time_taken'])

    # run-key "epochs_features_layers" -> (predictions DataFrame, result tuple);
    # unpacked into separate containers in a later cell.
    predictions = {}

    results = []
    best_acc = 0         # best test accuracy within the current run (reset per run)
    best_acc_global = 0  # best test accuracy across all runs

    def train(epochs, net, h,f, lrs):
        """Train `net` for `epochs` epochs at each learning rate in `lrs`.

        h / f are the hidden-layer and feature counts, used only for the
        checkpoint path and bookkeeping keys.

        NOTE(review): defined without self/@staticmethod — works only because
        it is always invoked as Train.train(...) on the class object.
        NOTE(review): the "best" model is selected by TEST accuracy, which
        leaks test information into model selection.
        """
        batch_iterations = 200
        train_loss = None
        Train.best_acc = 0  # reset the per-run best before this grid point
        os.makedirs("dataset/tf_lstm_nsl_kdd-/hidden layers_{}_features count_{}".format(h,f),
                    exist_ok = True)
        with tf.Session() as sess:
            #summary_writer_train = tf.summary.FileWriter('./logs/kdd/VAE/training', graph=sess.graph)
            #summary_writer_valid = tf.summary.FileWriter('./logs/kdd/VAE/validation')

            sess.run(tf.global_variables_initializer())
            start_time = time.perf_counter()
            for lr in lrs:
                for epoch in range(1, (epochs+1)):
                    # Fresh random 90/10 train/validation split every epoch.
                    x_train, x_valid, y_train, y_valid, = ms.train_test_split(preprocess.x_train, 
                                                                              preprocess.y_train, 
                                                                              test_size=0.1)
                    batch_indices = np.array_split(np.arange(x_train.shape[0]), 
                                               batch_iterations)

                    for i in batch_indices:

                        # np.newaxis adds the length-1 time-step axis expected
                        # by the [batch, 1, features] placeholders.
                        _, train_loss = sess.run([net.train_op, net.regularized_loss], #net.summary_op
                                                              feed_dict={net.x_input: x_train[i,np.newaxis,:], 
                                                                         net.y_input_: y_train[i,np.newaxis,:], 
                                                                         net.keep_prob:1, net.lr:lr})
                        #summary_writer_train.add_summary(summary_str, epoch)
                        # Only surface pathologically exploding losses.
                        if(train_loss > 1e9):
                            print("Step {} | Training Loss: {:.6f}".format(epoch, train_loss))


                    # Validation metrics on the held-out 10%.
                    valid_accuracy,valid_loss = sess.run([net.tf_accuracy, net.regularized_loss], #net.summary_op 
                                                          feed_dict={net.x_input: x_valid[:,np.newaxis,:], 
                                                                     net.y_input_: y_valid[:,np.newaxis,:], 
                                                                     net.keep_prob:1, net.lr:lr})
                    #summary_writer_valid.add_summary(summary_str, epoch)



                    # Full test-set evaluation every epoch (used for model selection).
                    accuracy, pred_value, actual_value, y_pred = sess.run([net.tf_accuracy, 
                                                                   net.pred, 
                                                                   net.actual, net.y], 
                                                                  feed_dict={net.x_input: preprocess.x_test[:,np.newaxis,:], 
                                                                             net.y_input_: preprocess.y_test[:,np.newaxis,:], 
                                                                             net.keep_prob:1, net.lr:lr})

                    print("Step {} | Training Loss: {:.6f} | Train Accuracy: {:.6f} | Test Accuracy: {:.6f}".format(epoch, train_loss, valid_accuracy, accuracy))

                    # Track the best model across the WHOLE grid search.
                    if accuracy > Train.best_acc_global:
                                Train.best_acc_global = accuracy
                                Train.pred_value = pred_value
                                Train.actual_value = actual_value
                                Train.best_parameters = "Hidden Layers:{}, Features Count:{}".format(h, f)

                    # Checkpoint + record whenever THIS run improves on itself.
                    if accuracy > Train.best_acc:

                        #net.saver.save(sess, "dataset/tf_vae_only_nsl_kdd_hidden layers_{}_features count_{}".format(epochs,h,f))
                        #Train.results.append(Train.result(epochs, f, h,valid_accuracy, accuracy))
                        #curr_pred = pd.DataFrame({"Attack_prob":y_pred[:,-2], "Normal_prob":y_pred[:, -1]})
                        #Train.predictions.update({"{}_{}_{}".format(epochs,f,h):curr_pred})

                        Train.best_acc = accuracy
                        # Skip checkpointing when the loss has gone NaN.
                        if not (np.isnan(train_loss)):
                            net.saver.save(sess, 
                                       "dataset/tf_lstm_nsl_kdd-/hidden layers_{}_features count_{}/model"
                                       .format(h,f), 
                                       global_step = epoch, 
                                       write_meta_graph=False)

                        # Per-sample class probabilities and the argmax prediction.
                        curr_pred = pd.DataFrame({"Attack_prob":y_pred[:,-2], "Normal_prob":y_pred[:, -1], "Prediction":pred_value})
                        Train.predictions.update({"{}_{}_{}".format(epochs*len(lrs),f,h):
                                                  (curr_pred, 
                                                   Train.result(epochs*len(lrs), f, h,valid_accuracy, accuracy, time.perf_counter() - start_time))})

In [9]:
import itertools
class Hyperparameters:
    # Grid-search driver: the loop below executes at class-definition time,
    # building and training one network per (epochs, hidden_layers, features)
    # combination.
#    features_arr = [2, 4, 8, 16, 32, 64, 128, 256]
#    hidden_layers_arr = [2, 4, 6, 10]
    features_arr = [122] #[4, 8, 16, 32]
    hidden_layers_arr = [2, 4, 6]

    epochs = [5]
    # Each learning rate runs the full epoch loop inside Train.train
    # (so effective epochs per grid point = epochs * len(lrs)).
    lrs = [1e-2, 1e-4]
    
    for e, h, f in itertools.product(epochs, hidden_layers_arr, features_arr):
        print("Current Layer Attributes - epochs:{} hidden layers:{} features count:{}".format(e,h,f))
        n = network(2,h,f)
        n.build_layers()
        Train.train(e, n, h,f, lrs)


Current Layer Attributes - epochs:5 hidden layers:2 features count:122
Step 1 | Training Loss: 0.008756 | Train Accuracy: 1.000000 | Test Accuracy: 0.966751
Step 2 | Training Loss: 0.000002 | Train Accuracy: 0.999603 | Test Accuracy: 0.941688
Step 3 | Training Loss: 0.000002 | Train Accuracy: 1.000000 | Test Accuracy: 0.948692
Step 4 | Training Loss: 0.000001 | Train Accuracy: 0.998413 | Test Accuracy: 0.996709
Step 5 | Training Loss: 0.000000 | Train Accuracy: 0.999603 | Test Accuracy: 0.994684
Step 1 | Training Loss: 0.000000 | Train Accuracy: 0.999603 | Test Accuracy: 0.994852
Step 2 | Training Loss: 0.000000 | Train Accuracy: 1.000000 | Test Accuracy: 0.994852
Step 3 | Training Loss: 0.000000 | Train Accuracy: 0.999603 | Test Accuracy: 0.994852
Step 4 | Training Loss: 0.000000 | Train Accuracy: 1.000000 | Test Accuracy: 0.994852
Step 5 | Training Loss: 0.000000 | Train Accuracy: 0.999603 | Test Accuracy: 0.994852
Current Layer Attributes - epochs:5 hidden layers:4 features count:122
Step 1 | Training Loss: 0.079646 | Train Accuracy: 0.942460 | Test Accuracy: 0.488354
Step 2 | Training Loss: 0.014094 | Train Accuracy: 0.919444 | Test Accuracy: 0.538650
Step 3 | Training Loss: 0.000388 | Train Accuracy: 0.990873 | Test Accuracy: 0.761435
Step 4 | Training Loss: 0.000023 | Train Accuracy: 0.990873 | Test Accuracy: 0.709789
Step 5 | Training Loss: 0.000059 | Train Accuracy: 0.992063 | Test Accuracy: 0.764051
Step 1 | Training Loss: 0.017268 | Train Accuracy: 0.991667 | Test Accuracy: 0.764219
Step 2 | Training Loss: 0.000093 | Train Accuracy: 0.989286 | Test Accuracy: 0.764304
Step 3 | Training Loss: 0.008610 | Train Accuracy: 0.992063 | Test Accuracy: 0.764557
Step 4 | Training Loss: 0.008566 | Train Accuracy: 0.990873 | Test Accuracy: 0.765316
Step 5 | Training Loss: 0.008485 | Train Accuracy: 0.994048 | Test Accuracy: 0.766498
Current Layer Attributes - epochs:5 hidden layers:6 features count:122
Step 1 | Training Loss: 0.079746 | Train Accuracy: 0.946429 | Test Accuracy: 0.752068
Step 2 | Training Loss: 0.079646 | Train Accuracy: 0.909921 | Test Accuracy: 0.767595
Step 3 | Training Loss: 0.530973 | Train Accuracy: 0.457540 | Test Accuracy: 0.817553
Step 4 | Training Loss: 0.566372 | Train Accuracy: 0.490079 | Test Accuracy: 0.808861
Step 5 | Training Loss: 0.584071 | Train Accuracy: 0.465873 | Test Accuracy: 0.809536
Step 1 | Training Loss: 0.504425 | Train Accuracy: 0.454762 | Test Accuracy: 0.809536
Step 2 | Training Loss: 0.522124 | Train Accuracy: 0.472222 | Test Accuracy: 0.809536
Step 3 | Training Loss: 0.539823 | Train Accuracy: 0.457143 | Test Accuracy: 0.809536
Step 4 | Training Loss: 0.477876 | Train Accuracy: 0.472222 | Test Accuracy: 0.809620
Step 5 | Training Loss: 0.522124 | Train Accuracy: 0.466667 | Test Accuracy: 0.809620

In [10]:
# Unzip Train.predictions, which maps run-key -> (predictions_df, score_record),
# into a dict of prediction frames and a list of score records.
dict1 = {run_key: pred_df for run_key, (pred_df, _) in Train.predictions.items()}
dict2 = [score for _, (_, score) in Train.predictions.items()]

In [11]:
# Replace the packed (df, score) tuples with the unpacked containers.
Train.predictions = dict1
Train.results = dict2

In [12]:
df_results = pd.DataFrame(Train.results)  # namedtuples -> one row per run, fields become columns

In [13]:
df_results.sort_values(by = 'test_score_20', ascending = False)  # best test accuracy first


Out[13]:
epoch no_of_features hidden_layers train_score test_score_20 time_taken
0 10 122 2 0.998413 0.996709 8.649876
2 10 122 6 0.457540 0.817553 17.537749
1 10 122 4 0.994048 0.766498 39.551998

In [14]:
# FIX: pd.Panel was deprecated in pandas 0.20 and removed in 0.25/1.0, so
# pd.Panel(Train.predictions) fails on any modern pandas. Pickling the dict of
# DataFrames directly preserves the same {run_key: predictions} information.
pd.to_pickle(Train.predictions, "dataset/tf_lstm_nsl_kdd_predictions-.pkl")
df_results.to_pickle("dataset/tf_lstm_nsl_kdd_scores-.pkl")

In [15]:
import numpy as np
import matplotlib.pyplot as plt
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix.

    Parameters
    ----------
    cm : square array of counts, e.g. from sklearn.metrics.confusion_matrix.
    classes : sequence of tick labels, one per class.
    normalize : if True, convert each row to fractions (per-true-class rates).
    title, cmap : forwarded to matplotlib.

    FIX: the original normalized `cm` *after* plt.imshow, so the heatmap
    colors reflected raw counts while the text annotations (and the
    white/black text threshold) used normalized values — image and labels
    disagreed. Normalizing first keeps them consistent.
    """
    np.set_printoptions(precision=4)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # White text on dark cells, black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j].round(4),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

def plot(actual_value, pred_value):
    """Render a normalized confusion matrix titled with the best run's parameters."""
    from sklearn.metrics import confusion_matrix

    matrix = confusion_matrix(y_true=actual_value, y_pred=pred_value)
    plt.figure(figsize=[6, 6])
    plot_confusion_matrix(
        matrix,
        preprocess.output_columns_2labels,
        normalize=True,
        title=Train.best_parameters,
    )

In [16]:
plot(actual_value = Train.actual_value, pred_value = Train.pred_value)  # best model across the whole grid search


Normalized confusion matrix
[[ 0.9969  0.0031]
 [ 0.0042  0.9958]]

In [17]:
# Persist the best run's predicted class indices (argmax over the
# ['is_Attack','is_Normal'] columns, so 0 = attack, 1 = normal) as CSV.
# Original author note: "4.5 GB" — presumably a size observation; verify.
pd.Series(Train.pred_value).to_csv('LSTM_prediction_values-.csv')

In [ ]: