Tavallaee, M., Bagheri, E., Lu, W., and Ghorbani, A. A. (2009). A detailed analysis of the KDD CUP 99 data set. In Proceedings of the Second IEEE Symposium on Computational Intelligence for Security and Defence Applications 2009.
In [1]:
# Librerias de Python
import time
import copy
# Dependencias internas
from learninspy.core.autoencoder import StackedAutoencoder
from learninspy.core.model import NetworkParameters
from learninspy.core.optimization import OptimizerParameters
from learninspy.core.stops import criterion
from learninspy.utils.data import split_data, label_data
from learninspy.utils.data import StandardScaler, LocalLabeledDataSet
from learninspy.utils.evaluation import ClassificationMetrics
from learninspy.utils.plots import plot_neurons, plot_fitting, plot_confusion_matrix
# Dependencias externas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Locations of the three NSL-KDD files used in this notebook: the 20% training
# subset, the KDDTest+ test set and the KDDTest-21 test set.
pathtrain = "/home/leeandro04/Documentos/Datos/KDD/NSL_KDD/20 Percent Training Set.csv"
pathtest = "/home/leeandro04/Documentos/Datos/KDD/NSL_KDD/KDDTest+.csv"
pathtest21 = "/home/leeandro04/Documentos/Datos/KDD/NSL_KDD/KDDTest-21.txt"

# The files are read with header=None, so the resulting columns carry integer
# labels (0, 1, 2, ...) that the following cells rely on.
alltrain, test, test21 = map(lambda path: pd.read_csv(path, header=None),
                             [pathtrain, pathtest, pathtest21])
In [3]:
# Show the raw training DataFrame as loaded, before any columns are dropped.
alltrain
Out[3]:
In [4]:
# Columns to discard, identified by their integer labels:
#   0-8   -> basic features
#   9-21  -> content features
drop = list(range(0, 9)) + list(range(9, 22))

# Remove those columns in place from the training set and both test sets,
# using a single list-based drop per frame.
for frame in (alltrain, test, test21):
    frame.drop(drop, axis=1, inplace=True)
In [5]:
# Summary statistics of the training columns that remain after columns 0-21 were dropped.
alltrain.describe()
Out[5]:
In [6]:
# Attack names found in the labelled KDDTrain+ data, grouped by attack category.
dict_attacks = {}
dict_attacks['dos'] = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop']
dict_attacks['u2r'] = ['buffer_overflow', 'loadmodule', 'perl', 'rootkit']
dict_attacks['r2l'] = ['ftp_write', 'guess_passwd', 'imap', 'multihop',
                       'phf', 'spy', 'warezclient', 'warezmaster']
dict_attacks['probe'] = ['ipsweep', 'nmap', 'portsweep', 'satan']
In [7]:
# Separo lo normal de los ataques en base a un Ground Truth (columna de etiquetas)
normal = alltrain[alltrain[41] == 'normal']
attack = alltrain[alltrain[41] != 'normal']
# Tiro las columnas de etiquetas
normal = normal.ix[:, :40]
attack = attack.ix[:, :40]
# Etiqueto datos
normal = label_data(normal.values, [0]*len(normal.values))
attack = label_data(attack.values, [1]*len(attack.values))
train, valid = split_data(normal, fractions=[0.7, 0.3])
print "Dimension de características: ", len(normal[0].features)
print "Cantidad de ejemplos normales: ", len(normal)
print "Cantidad de ejemplos de ataques: ", len(attack)
print "Cantidad de train: ", len(train)
print "Cantidad de valid: ", len(valid)
In [8]:
# Define the Stacked AutoEncoder configuration and build the model
# (training itself is launched in a later cell).
net_params = NetworkParameters(
    units_layers=[19, 10, 2],
    activation='ReLU',
    classification=True,
    dropout_ratios=[0.2, 0.0],
    strength_l1=1e-6,
    strength_l2=5e-5
)
saekdd = StackedAutoencoder(net_params, dropout=[0.0, 0.0])
In [9]:
# Settings for the pre-training stage.
# local_stops_sae is handed to the optimizer parameters below, while
# global_stops_sae is passed to saekdd.fit in a later cell.
local_stops_sae = [
    criterion['MaxIterations'](30),
    criterion['AchieveTolerance'](0.95, key='hits'),
]
global_stops_sae = [
    criterion['MaxIterations'](20),
    criterion['AchieveTolerance'](0.95, key='hits'),
]

# Adadelta options kept in a named dict so the optimizer call stays short.
adadelta_options = {'step-rate': 1, 'decay': 0.995, 'momentum': 0.7, 'offset': 1e-8}
opt_params_sae = OptimizerParameters(algorithm='Adadelta',
                                     options=adadelta_options,
                                     stops=local_stops_sae,
                                     merge_criter='w_avg',
                                     merge_goal='hits')
In [10]:
# Settings for the fine-tuning stage.
# local_stops_ft is handed to the optimizer parameters below, while
# global_stops_ft is passed to saekdd.finetune in a later cell.
local_stops_ft = [
    criterion['MaxIterations'](5),
    criterion['AchieveTolerance'](0.9, key='hits'),
]
global_stops_ft = [
    criterion['MaxIterations'](20),
    criterion['AchieveTolerance'](0.85, key='hits'),
]

# Gradient descent with Nesterov momentum; options kept in a named dict.
gd_options = {'step-rate': 1e-3, 'momentum': 0.9, 'momentum_type': 'nesterov'}
opt_params_ft = OptimizerParameters(algorithm='GD',
                                    options=gd_options,
                                    stops=local_stops_ft,
                                    merge_criter='w_avg')
In [11]:
# Pre-train the stacked autoencoder. Both train and valid come from the
# normal-traffic split, so no attack examples are seen at this stage.
hits_valid = saekdd.fit(
    train,
    valid,
    mini_batch=20,
    parallelism=10,
    valid_iters=1,
    stops=global_stops_sae,
    optimizer_params=opt_params_sae,
    reproducible=True
)
In [12]:
hits_attack, predictions = saekdd.evaluate(attack[1000:5000], predictions=True)
print "Hits de valid: ", hits_valid
print "Hits de ataques: ", hits_attack
print "Accuracy de ataques: ", len(filter(lambda (lp, p): lp.label == p, zip(attack[1000:5000], predictions))) / float(len(attack[1000:5000]))
In [13]:
# Supervised fine-tuning set: the normal training examples plus the first
# 1000 attack examples, split again into train/validation parts.
train2, valid2 = split_data(train+attack[:1000], fractions=[0.7, 0.3])
# Fine-tune with the optimizer configured for this stage (opt_params_ft).
# The pre-training optimizer (opt_params_sae) was passed here before, which
# left opt_params_ft unused even though it is the configuration reported as
# "Fine-tuning supervisado" further down.
hits_total = saekdd.finetune(train2, valid2, mini_batch=20, parallelism=10, stops=global_stops_ft, valid_iters=1,
                             optimizer_params=opt_params_ft, keep_best=True)
In [14]:
print "Metricas: "
hits, predictions = saekdd.evaluate(valid+attack[1000:], predictions=True)
labels = map(lambda lp: float(lp.label), valid+attack[1000:])
metrics = ClassificationMetrics(zip(predictions, labels), 2)
print "Total of normal events: ", len(valid)
print "Precision of normal: ", metrics.precision(label=0)
print "Recall of normal: ", metrics.recall(label=0)
print "F1-Score of normal: ", metrics.f_measure(label=0)
print "Accuracy of normal: ", metrics.accuracy(label=0)
print ""
print "Total of attack events: ", len(attack[1000:5000])
print "Precision of attacks: ", metrics.precision(label=1)
print "Recall of attacks: ", metrics.recall(label=1)
print "F1-Score of attacks: ", metrics.f_measure(label=1)
print "Accuracy of attacks: ", metrics.accuracy(label=1)
print ""
print "Precision of total: ", metrics.precision()
print "Recall of total: ", metrics.recall()
print "F1-Score of total: ", metrics.f_measure()
print "Accuracy of total: ", metrics.accuracy()
plot_confusion_matrix(metrics.confusion_matrix(), show=True)
reduction = 1. - (metrics.confusion_matrix()[0][1]+metrics.confusion_matrix()[1][1]) / float(sum(sum(metrics.confusion_matrix())))
print "Reduction of total: ", reduction * 100,"%"
In [15]:
filename = '/tmp/model/nsl-kdd_learninspy_conft'
saekdd.save(filename)
print "Modelo StackedAutoencoder:"
print str(saekdd.params)
print "Optimización no-supervisada:"
print str(opt_params_sae)
print "Fine-tuning supervisado:"
print str(opt_params_ft)
In [16]:
# Summary statistics of the KDDTest+ columns that remain after columns 0-21 were dropped.
test.describe()
Out[16]:
In [17]:
# Split normal traffic from attacks in KDDTest+ using the ground truth
# (label column 41).
normal = test[test[41] == 'normal']
anomal = test[test[41] != 'normal']
# Drop the label column: keep every remaining column up to and including 40.
# .loc replaces the deprecated .ix (removed in pandas 1.0); the columns carry
# integer labels, so this label-based slice selects the same columns.
normal = normal.loc[:, :40]
anomal = anomal.loc[:, :40]
# Label the examples: 0 = normal, 1 = attack.
normal = label_data(normal.values, [0]*len(normal.values))
anomal = label_data(anomal.values, [1]*len(anomal.values))
In [18]:
print "Metricas: "
hits, predictions = saekdd.evaluate(normal+anomal, predictions=True)
labels = map(lambda lp: float(lp.label), normal+anomal)
metrics = ClassificationMetrics(zip(predictions, labels), 2)
print "Precision of normal: ", metrics.precision(label=0)
print "Recall of normal: ", metrics.recall(label=0)
print "F1-Score of normal: ", metrics.f_measure(label=0)
print "Accuracy of normal: ", metrics.accuracy(label=0)
print ""
print "Precision of attacks: ", metrics.precision(label=1)
print "Recall of attacks: ", metrics.recall(label=1)
print "F1-Score of attacks: ", metrics.f_measure(label=1)
print "Accuracy of attacks: ", metrics.accuracy(label=1)
print ""
print "Precision of total: ", metrics.precision()
print "Recall of total: ", metrics.recall()
print "F1-Score of total: ", metrics.f_measure()
print "Accuracy of total: ", metrics.accuracy()
plot_confusion_matrix(metrics.confusion_matrix(), show=True)
reduction = 1. - (metrics.confusion_matrix()[0][1]+metrics.confusion_matrix()[1][1]) / float(sum(sum(metrics.confusion_matrix())))
print "Reduction of total: ", reduction * 100,"%"
In [19]:
# Summary statistics of the KDDTest-21 columns that remain after columns 0-21 were dropped.
test21.describe()
Out[19]:
In [20]:
# Split normal traffic from attacks in KDDTest-21 using the ground truth
# (label column 41).
normal = test21[test21[41] == 'normal']
anomal = test21[test21[41] != 'normal']
# Drop the label column: keep every remaining column up to and including 40.
# .loc replaces the deprecated .ix (removed in pandas 1.0); the columns carry
# integer labels, so this label-based slice selects the same columns.
normal = normal.loc[:, :40]
anomal = anomal.loc[:, :40]
# Label the examples: 0 = normal, 1 = attack.
normal = label_data(normal.values, [0]*len(normal.values))
anomal = label_data(anomal.values, [1]*len(anomal.values))
In [21]:
print "Metricas: "
hits, predictions = saekdd.evaluate(normal+anomal, predictions=True)
labels = map(lambda lp: float(lp.label), normal+anomal)
metrics = ClassificationMetrics(zip(predictions, labels), 2)
print "Precision of normal: ", metrics.precision(label=0)
print "Recall of normal: ", metrics.recall(label=0)
print "F1-Score of normal: ", metrics.f_measure(label=0)
print "Accuracy of normal: ", metrics.accuracy(label=0)
print ""
print "Precision of attacks: ", metrics.precision(label=1)
print "Recall of attacks: ", metrics.recall(label=1)
print "F1-Score of attacks: ", metrics.f_measure(label=1)
print "Accuracy of attacks: ", metrics.accuracy(label=1)
print ""
print "Precision of total: ", metrics.precision()
print "Recall of total: ", metrics.recall()
print "F1-Score of total: ", metrics.f_measure()
print "Accuracy of total: ", metrics.accuracy()
plot_confusion_matrix(metrics.confusion_matrix(), show=True)
reduction = 1. - (metrics.confusion_matrix()[0][1]+metrics.confusion_matrix()[1][1]) / float(sum(sum(metrics.confusion_matrix())))
print "Reduction of total: ", reduction * 100,"%"
In [22]:
from learninspy.utils.plots import plot_fitting
print "Desempeño del ajuste fino"
plot_fitting(saekdd)
print "Pesos sinápticos del AE"
plot_neurons(saekdd)
In [23]:
data = normal
x = data[100].features
en1 = saekdd.list_layers[0].encode(x).matrix
print "Patrón original: "
print x
print "Patrón codificado: "
print list(en1.T[0])
print ""
median_feat = np.median(map(lambda r: r.features, data), 0)
median_encod = np.median(map(lambda r: saekdd.list_layers[0].encode(r.features).matrix.T[0], data), 0)
print "Mediana de features originales"
plt.stem(median_feat)
plt.show()
print ""
print "Mediana de features codificadas"
plt.stem(median_encod)
plt.show()
In [ ]: