[1]
- duration: connection time in seconds.
- protocol_type: i.e. TCP, UDP.
- service: network service on destination (i.e. HTTP, Telnet).
- source_bytes: amount of data from source to destination.
- flag: normal or error status of connection.
- land: connection is from/to the same host/port: "1", else "0".
- wrong_fragment: number of "wrong" fragments.
- urgent: number of urgent packets.
- hot: number of "hot" indicators.
- num_failed_logins: number of attempts.
- logged_in: success: "1", else "0".
- compromised: number of "compromised" conditions.
- root_shell: if root shell obtained: "1", else "0".
- su_attempted: if "su" command attempted: "1", else "0".
- num_root: number of root accesses.
- num_file_creations: number of creation operations.
- num_shells: number of shell prompts.
- num_access_files: number of operations on access control files.
- num_outbound_cmds: number per session.
- is_hot_login: if login on "hot" list: "1", else "0".
- is_guest_login: if guest: "1", else "0".
- count: number of connections to the same host.
- serror_rate: % of connections with "SYN" errors.
- rerror_rate: % of connections with "REJ" errors.
- same_srv_rate: % of connections to the same service.
- diff_srv_rate: % of connections to different services.
- srv_count: number of connections to the same service.
- srv_serror_rate: % of connections with "SYN" errors.
- srv_rerror_rate: % of connections with "REJ" errors.
- srv_diff_host_rate: % of connections to different hosts.

[2]
)
In [1]:
from array import array
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
%matplotlib inline
In [2]:
# Fetch the KDD Cup 99 data through scikit-learn; percent10=True requests the
# reduced version of the dataset rather than the full one.
dataset_part = fetch_kddcup99(percent10=True) # Over 600 MB in memory.
# dataset_full = fetch_kddcup99(percent10=False) # Crashed my computer with 16 GB of RAM.
In [3]:
# Display the first record of the feature matrix as a quick sanity check.
dataset_part.data[0] # Sample of TCP record.
Out[3]:
In [4]:
# Deduplicate the target labels with a set, then count the distinct values.
len(set(dataset_part.target)) # Number of unique classifications.
Out[4]:
In [ ]:
In [5]:
# Wrap the raw feature array in a DataFrame so that columns can be converted
# and encoded one at a time in the following cells.
df = pd.DataFrame(dataset_part.data)
In [6]:
# Show the first row as loaded, before any dtype conversion or encoding.
df.head(1)
Out[6]:
In [7]:
def _to_numeric_or_keep(column):
    """Convert a column to a numeric dtype, or return it unchanged.

    This reproduces what ``pd.to_numeric(..., errors='ignore')`` used to do.
    That option is deprecated in recent pandas releases, so the fallback is
    spelled out explicitly here.

    Parameters
    ----------
    column : pandas.Series
        One column of the feature frame.

    Returns
    -------
    pandas.Series
        The numeric version of ``column`` when every value parses, otherwise
        the original ``column`` untouched.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Non-numeric content (e.g. the bytes-valued categorical columns):
        # leave it alone so the label-encoding step can handle it.
        return column


# Columns that hold only numbers get a proper numeric dtype; the rest stay object.
df = df.apply(_to_numeric_or_keep)
In [8]:
# Example from http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
'''
le = preprocessing.LabelEncoder()
le.fit(list(names))
# le.classes_ # Shows all labels.
print(le.transform([b'icmpeco_iSF', b'icmpecr_iSF', b'icmpred_iSF']) )
print(le.inverse_transform([0, 0, 1, 2]))
'''
# https://datascience.stackexchange.com/questions/16728/could-not-convert-string-to-float-error-on-kddcup99-dataset
# Find the columns still stored as generic Python objects, then replace each
# one with integer codes from its own freshly fitted LabelEncoder.
object_columns = [label for label in df.columns if df[label].dtype == object]
for label in object_columns:
    df[label] = preprocessing.LabelEncoder().fit_transform(df[label])
In [9]:
# Re-display the first row to check the result of the encoding loop.
df.head(1) # All strings removed.
Out[9]:
In [10]:
# Feature matrix taken straight from the DataFrame's underlying values.
X = df.values
# Encode the target labels as integers for the classifiers.
le = preprocessing.LabelEncoder()
y = le.fit_transform(dataset_part.target)
# Lookup table from encoded class id to original label. LabelEncoder assigns
# id i to le.classes_[i], so enumerate() yields the exact mapping. Zipping `y`
# (one entry per record) with le.classes_ instead paired the first records'
# ids with unrelated class names, giving a wrong and incomplete table.
y_dict = dict(enumerate(le.classes_)) # Saved for later lookup.
In [11]:
# Cross-validation settings shared by the model comparison further down.
N_SPLITS: int = 7  # number of folds handed to KFold
SCORING: str = 'accuracy'  # metric name handed to cross_val_score
In [12]:
# Hold out a fixed share of the records as a validation set; the seed keeps
# the split repeatable between runs.
SEED = 42
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=SEED,
)
In [21]:
# Algorithms
models = [
#('LR', LogisticRegression()),
('LDA', LinearDiscriminantAnalysis()),
#('KNN', KNeighborsClassifier()),
#('KMN', KMeans()),
#('CART', DecisionTreeClassifier()),
#('NB', GaussianNB()),
]
# evaluate each model in turn
results = []
names = []
print('{:8}{:^8}{:^8}'.format('Model','mean','std'))
print('-' * 23)
for name, model in models:
kfold = KFold(n_splits=N_SPLITS, random_state=SEED)
%timeit -n1 cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=SCORING)
results.append(cv_results)
names.append(name)
print('{:8}{:^8.2%}{:^8.2%}'.format(name, cv_results.mean(), cv_results.std()))
In [19]:
# Unpack cv_results and print its individual scores separated by spaces.
# Relies on cv_results left over from the model-evaluation cell.
print(*cv_results)
In [ ]:
# Scores noted down from earlier runs, kept as a plain string for reference.
# NOTE(review): values look like "mean (std)" of the CV scores - confirm.
previous_results = '''
LR: 98.87% (0.10%)
LDA: 99.49% (0.05%)
KNN: 99.84% (0.01%) <-- slow
CART: 99.94% (0.00%)
NB: 93.96% (0.96%)
SVM: <-- very slow
'''
In [ ]:
In [ ]:
# Compare algorithms: one box per evaluated model, showing the spread of its
# cross-validation scores.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
# Label each box with its model name. `names` is filled in step with
# `results`, whereas `y` is the encoded target vector (one entry per record)
# and has nothing to do with the boxes.
ax.set_xticklabels(names)
ax.set_ylabel('Cross-validation score')
plt.show()
In [ ]:
In [ ]:
# `neigh` was not defined anywhere in the notebook, so this cell raised
# NameError on a fresh kernel. Fit a k-nearest-neighbours classifier on the
# training split before querying it.
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train)

# A single hand-written record, already in the numeric (label-encoded) form
# the classifier was trained on.
test = [0, 1, 22, 9, 181, 5450, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 9, 9, 1.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.0]
# predict/predict_proba expect a 2-D input, hence the wrapping list.
print(neigh.predict([test]))
print(neigh.predict_proba([test])) # TODO: research this.
In [ ]:
[1] - KDD Cup 99 dataset

[2] - M. Tavallaee, E. Bagheri, W. Lu, and A. Ghorbani, “A Detailed Analysis of the KDD CUP 99 Data Set,” Submitted to Second IEEE Symposium on Computational Intelligence for Security and Defense Applications (CISDA), 2009. link

Logs

Labelled datasets
In [ ]:
In [ ]:
# Scratch cell: header row with three 10-character columns, followed by a
# hard-coded LDA result line.
header_format = '{:10}{:10}{:10}'
print(header_format.format('Model', 'mean', 'std'))
print('LDA: 99.49% (0.05%)')
In [17]:
# Scratch cell: same table layout as the model-evaluation loop, fed with
# hard-coded LDA numbers to preview the formatting.
header_format = '{:8}{:^8}{:^8}'
row_format = '{:8}{:^8.2%}{:^8.2%}'
print(header_format.format('Model', 'mean', 'std'))
print('-' * 23)
print(row_format.format('LDA', .9949, .0005))
In [ ]: