feature name | description | type |
---|---|---|
duration | length (number of seconds) of the connection | continuous |
protocol_type | type of the protocol, e.g. tcp, udp, etc. | discrete |
service | network service on the destination, e.g., http, telnet, etc. | discrete |
src_bytes | number of data bytes from source to destination | continuous |
dst_bytes | number of data bytes from destination to source | continuous |
flag | normal or error status of the connection | discrete |
land | 1 if connection is from/to the same host/port; 0 otherwise | discrete |
wrong_fragment | number of "wrong" fragments | continuous |
urgent | number of urgent packets | continuous |
feature name | description | type |
---|---|---|
hot | number of "hot" indicators | continuous |
num_failed_logins | number of failed login attempts | continuous |
logged_in | 1 if successfully logged in; 0 otherwise | discrete |
num_compromised | number of "compromised" conditions | continuous |
root_shell | 1 if root shell is obtained; 0 otherwise | discrete |
su_attempted | 1 if "su root" command attempted; 0 otherwise | discrete |
num_root | number of "root" accesses | continuous |
num_file_creations | number of file creation operations | continuous |
num_shells | number of shell prompts | continuous |
num_access_files | number of operations on access control files | continuous |
num_outbound_cmds | number of outbound commands in an ftp session | continuous |
is_hot_login | 1 if the login belongs to the "hot" list; 0 otherwise | discrete |
is_guest_login | 1 if the login is a "guest" login; 0 otherwise | discrete |
In [1]:
%matplotlib inline
#%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import constants
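The `constants` module imported above is local to this project and is not shown in the notebook. A minimal sketch of what it presumably provides, assuming the standard KDD Cup 99 schema, could look like the following (the real module also carries `my_colors`, `label_names`, `names_without_changes`, and the label-mapping dicts used later):

```python
# constants.py -- hypothetical sketch, not the actual module used in this notebook
names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_hot_login', 'is_guest_login',
    # ... two-second-window traffic features (see traffic_features below) ...
    'label',
]

# traffic features computed over a two-second time window, dropped in In [2]
traffic_features = ['count', 'srv_count', 'serror_rate', 'srv_serror_rate',
                    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
                    'diff_srv_rate', 'srv_diff_host_rate']

# non-numeric columns that get a *_num encoding in In [5]
categorical_names = ['protocol_type', 'service', 'flag']
```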
In [2]:
data_10_percent = 'kddcup.data_10_percent'
data_full = 'kddcup.data'
data = pd.read_csv(data_10_percent, names=constants.names)
# Remove Traffic features computed using a two-second time window
data.drop(constants.traffic_features, inplace=True, axis=1)
In [3]:
data.head()
Out[3]:
In [4]:
data.describe()
Out[4]:
In [5]:
from sklearn import preprocessing
le_dicts = {}
for categorical_name in constants.categorical_names:
    le = preprocessing.LabelEncoder()
    le.fit(data[categorical_name])
    # keep an int -> original category mapping for readable legends later
    le_dicts[categorical_name] = dict(zip(le.transform(le.classes_), le.classes_))
    print(categorical_name, ':', le_dicts[categorical_name])
    data[categorical_name + '_num'] = le.transform(data[categorical_name])
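`LabelEncoder` assigns consecutive integer codes to the sorted distinct values of a column, and the `le_dicts` reverse mapping is what lets the plots below show readable category names instead of those codes. A tiny standalone illustration with made-up values (not the dataset itself):

```python
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
codes = le.fit_transform(['tcp', 'udp', 'icmp', 'tcp'])
print(codes)                                              # [1 2 0 1] -- classes are sorted alphabetically
print(dict(zip(le.transform(le.classes_), le.classes_)))  # {0: 'icmp', 1: 'tcp', 2: 'udp'}
```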
In [6]:
data['label'].value_counts()
Out[6]:
In [7]:
data['label_binary_num'] = data.label.apply(lambda label: 0 if label == 'normal.' else 1)
data['label_binary_num'].value_counts()
Out[7]:
In [8]:
data['label_four'] = data.label.apply(lambda label: constants.label_to_four_attack_class[label])
data['label_four_num'] = data.label_four.apply(lambda label: constants.five_classes_to_num[label])
pd.value_counts(data['label_four'], sort=True).plot.bar()
Out[8]:
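`label_to_four_attack_class` and `five_classes_to_num` come from the same unshown `constants` module. Assuming the usual KDD Cup 99 grouping of the raw attack labels into `normal` plus four attack categories (`dos`, `probe`, `r2l`, `u2r`), they presumably look roughly like this sketch (only a few attack labels shown, and the numeric codes are an assumption):

```python
# hypothetical sketch of the label-mapping dicts in constants.py
label_to_four_attack_class = {
    'normal.': 'normal',
    # denial of service
    'smurf.': 'dos', 'neptune.': 'dos', 'back.': 'dos', 'teardrop.': 'dos',
    # probing / scanning
    'satan.': 'probe', 'ipsweep.': 'probe', 'portsweep.': 'probe', 'nmap.': 'probe',
    # remote-to-local
    'guess_passwd.': 'r2l', 'warezclient.': 'r2l', 'ftp_write.': 'r2l',
    # user-to-root
    'buffer_overflow.': 'u2r', 'rootkit.': 'u2r', 'loadmodule.': 'u2r',
    # ... remaining attack labels omitted ...
}

five_classes_to_num = {'normal': 0, 'dos': 1, 'probe': 2, 'r2l': 3, 'u2r': 4}
```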
In [9]:
#all data
pd.value_counts(data['protocol_type'], sort=True).plot.bar()
Out[9]:
In [10]:
# all data according to label_binary
pd.pivot_table(data[['protocol_type_num', 'label_binary_num']].assign(count=1),
               index=['label_binary_num'],
               columns=['protocol_type_num'],
               aggfunc='count').plot(kind='bar', color=constants.my_colors)
handles = [mpatches.Patch(label=le_dicts['protocol_type'][i],
                          color=constants.my_colors[i]) for i in sorted(le_dicts['protocol_type'])]
plt.legend(handles=handles, loc=2)
plt.show()
In [11]:
pd.pivot_table(data[['protocol_type', 'label_binary_num']].assign(count=1),
               index=['label_binary_num'],
               columns=['protocol_type'],
               aggfunc='count')
Out[11]:
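The `assign(count=1)` + `pivot_table(..., aggfunc='count')` pattern used above is just counting rows per (label, protocol) pair; `pd.crosstab` expresses the same contingency table more directly and should produce identical numbers:

```python
# same table as the pivot above, without the helper column
pd.crosstab(data['label_binary_num'], data['protocol_type'])

# row-normalised variant: share of each protocol within normal vs. attack traffic
pd.crosstab(data['label_binary_num'], data['protocol_type'], normalize='index')
```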
In [12]:
data['service'].value_counts()[:10]
Out[12]:
In [13]:
# all data by service
pd.value_counts(data['service'], sort=True).mask(lambda x: x < 200)\
    .dropna()\
    .plot(kind='bar', logy=True, figsize=(20, 5))
Out[13]:
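`Series.mask(cond)` replaces the values where the condition holds with NaN, so `mask(lambda x: x < 200).dropna()` is simply a way of keeping services with at least 200 connections. Plain boolean indexing does the same thing and may read more naturally:

```python
# equivalent to the mask/dropna chain above
counts = data['service'].value_counts()
counts[counts >= 200].plot(kind='bar', logy=True, figsize=(20, 5))
```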
In [14]:
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
pd.pivot_table(data[['service_num', 'service', 'label_binary_num']].assign(count=1),
               index=['service'],
               columns=['label_binary_num'],
               aggfunc='count')['count'][0].mask(lambda x: x < 200)\
    .dropna().sort_values(ascending=False).plot(kind='bar',
                                                logy=True,
                                                ax=axes[0])
pd.pivot_table(data[['service_num', 'service', 'label_binary_num']].assign(count=1),
               index=['service'],
               columns=['label_binary_num'],
               aggfunc='count')['count'][1].mask(lambda x: x < 200)\
    .dropna().sort_values(ascending=False).plot(kind='bar',
                                                logy=True,
                                                ax=axes[1])
Out[14]:
In [15]:
# only services that appear in both classes (no NA in either count column)
pd.pivot_table(data[['service', 'label_binary_num']].assign(count=1),
               index=['service'],
               columns=['label_binary_num'],
               aggfunc='count').sort_values(('count', 0), ascending=False).dropna()
Out[15]:
In [16]:
pd.value_counts(data['flag'], sort=True).plot(kind='bar', logy=True, figsize=(15, 5))
Out[16]:
In [17]:
# flag according to label_binary
pd.pivot_table(data[['flag_num', 'label_binary_num']].assign(count=1),
               index=['label_binary_num'],
               columns=['flag_num'],
               aggfunc='count').plot(kind='bar', color=constants.my_colors, logy=True, legend=False, figsize=(15, 5))
handles = [mpatches.Patch(label=le_dicts['flag'][i],
                          color=constants.my_colors[i]) for i in sorted(le_dicts['flag'])]
plt.legend(handles=handles)
plt.show()
In [18]:
pd.pivot_table(data[['flag', 'label_binary_num']].assign(count=1),
               index=['label_binary_num'],
               columns=['flag'],
               aggfunc='count')
Out[18]:
In [19]:
# Corr with binary label
data.drop(constants.categorical_names + constants.label_names, axis=1).corrwith(data.label_binary_num).sort_values()
Out[19]:
In [20]:
# Corr with 5 labels
data.drop(constants.categorical_names + constants.label_names, axis=1).corrwith(data.label_four_num).sort_values()
Out[20]:
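`corrwith` computes the Pearson correlation of every remaining numeric column with the label. For ranking candidate features the sign does not matter, so sorting by absolute value gives a quicker overview; note also that the correlation with `label_four_num` depends on the arbitrary numeric ordering of the five classes, so it should be read with care. A small convenience sketch:

```python
numeric = data.drop(constants.categorical_names + constants.label_names, axis=1)
numeric.corrwith(data['label_binary_num']).abs().sort_values(ascending=False).head(10)
```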
In [21]:
# corr heatmap
# the last 2 columns are label_binary_num and label_four_num, which is why that corner looks so hot
plt.figure(figsize=(7,7))
plt.matshow(data.drop(constants.categorical_names + \
                      ['label', 'label_four'] + \
                      constants.names_without_changes, axis=1).corr(), fignum=1)
Out[21]:
In [22]:
for i, elem in enumerate(data.drop(constants.categorical_names + \
                                   ['label', 'label_four'] + \
                                   constants.names_without_changes, axis=1).columns.tolist()):
    print(i, elem)
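The heatmap in In [21] has no tick labels, which is why the cell above prints the index-to-column mapping. Attaching the column names directly to the axes avoids that lookup; a sketch using the same column selection:

```python
cols = data.drop(constants.categorical_names +
                 ['label', 'label_four'] +
                 constants.names_without_changes, axis=1)
corr = cols.corr()
fig = plt.figure(figsize=(9, 9))
plt.matshow(corr, fignum=fig.number)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.colorbar()
plt.show()
```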
In [23]:
from sklearn.ensemble import ExtraTreesClassifier
forest = ExtraTreesClassifier(n_estimators=500,
                              random_state=42)
data_test = data.drop(constants.categorical_names +
                      constants.label_names +
                      constants.names_without_changes, axis=1)
In [24]:
#for 2 labels
forest.fit(data_test, data['label_binary_num'])
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(15, 4))
plt.bar(range(data_test.shape[1]), importances[indices], color="r", align="center")
plt.xticks(range(data_test.shape[1]), indices)
plt.xlim([-1, data_test.shape[1]])
plt.show()
In [25]:
#for 5 labels
forest.fit(data_test, data['label_four_num'])
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(15, 4))
plt.bar(range(data_test.shape[1]), importances[indices], color="r", align="center")
plt.xticks(range(data_test.shape[1]), indices)
plt.xlim([-1, data_test.shape[1]])
plt.show()
In [26]:
for i, elem in enumerate(data_test.columns.tolist()):
    print(i, elem)
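As with the heatmap, the importance bars are labelled only by column index. Pairing the importances with the actual feature names (plus the spread across the individual trees, available via `forest.estimators_`) makes the ranking easier to read. A small sketch, assuming `forest` is still the model fitted on the five-class target in In [25]:

```python
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
importance_table = pd.DataFrame({
    'feature': data_test.columns,
    'importance': forest.feature_importances_,
    'std': std,
}).sort_values('importance', ascending=False)
print(importance_table.head(10))
```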