In [1]:
import pandas as pd
In [2]:
# Names for the 43 columns of the header-less data files, in file order.
# The last two are not features: `attack_type` is the label and `other` is
# an extra trailing column (NOTE(review): presumably the NSL-KDD difficulty
# score -- confirm against the dataset description).
columns = """
    duration protocol service flag src_bytes dst_bytes land wrong_fragment
    urgent hot num_failed_logins logged_in num_compromised root_shell su_attempted
    num_root num_file_creations num_shells num_access_files num_outbound_cmds
    is_host_login is_guest_login count srv_count serror_rate srv_serror_rate
    rerror_rate srv_rerror_rate same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
    attack_type other
""".split()
In [3]:
# Neither file has a header row, so every column is named from `columns`.
csv_options = {'header': None, 'names': columns}
train_df = pd.read_csv('data/nsl_kdd/KDDTrain+.txt', **csv_options)
test_df = pd.read_csv('data/nsl_kdd/KDDTest+.txt', **csv_options)
In [4]:
# Preview the first rows of the training frame.
train_df.head()
Out[4]:
In [5]:
# Preview the first rows of the test frame.
test_df.head()
Out[5]:
Concatenate the datasets into one large dataframe (we will break them up into train-test splits later on)
In [6]:
# Stack the two frames into one. ignore_index=True renumbers the rows
# 0..n-1; without it the rows from test_df keep their own labels, which
# repeat labels already used by train_df (both files are read with the
# default 0-based index), leaving the combined frame with duplicate labels.
df = pd.concat([train_df, test_df], ignore_index=True)
df.shape
Out[6]:
In [7]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns# for pretty plots
The target label to predict is attack_type
In [8]:
# Distinct attack labels present in the combined data.
df['attack_type'].unique()
Out[8]:
In [9]:
# Bar chart of how often each `attack_type` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="attack_type", ax=ax)
plt.xticks(rotation=45)
Out[9]:
Clearly the data is unevenly distributed. Let's make a new variable called attack_set that contains the superset (category) each attack type belongs to
In [10]:
# Seed attack_set with the fine-grained label; the next cell collapses
# the individual attack names into their categories.
df['attack_set'] = df['attack_type'].copy()
In [11]:
# Category -> attack names that belong to it. Keeping the taxonomy as data
# makes it reviewable at a glance and lets the relabelling happen in one pass
# instead of one full-column scan per attack name.
ATTACK_SETS = {
    # DOS
    'dos': ['neptune', 'back', 'land', 'pod', 'smurf', 'teardrop', 'mailbomb',
            'processtable', 'udpstorm', 'apache2', 'worm'],
    # User-to-Root (U2R)
    'u2r': ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'sqlattack',
            'xterm', 'ps'],
    # Remote-to-Local (R2L)
    'r2l': ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
            'warezclient', 'warezmaster', 'xlock', 'xsnoop', 'snmpgetattack',
            'httptunnel', 'snmpguess', 'sendmail', 'named'],
    # Probe attacks
    'probe': ['satan', 'ipsweep', 'nmap', 'portsweep', 'saint', 'mscan'],
}

# Invert to attack name -> category for the lookup.
attack_to_set = {
    attack: attack_set
    for attack_set, attacks in ATTACK_SETS.items()
    for attack in attacks
}

# Values that are not keys of the mapping are left unchanged. None of the
# category names is itself a key, so re-running this cell is a no-op.
df['attack_set'] = df['attack_set'].replace(attack_to_set)
In [12]:
# Distinct values left in attack_set after the relabelling.
df['attack_set'].unique()
Out[12]:
In [13]:
# Row count per individual attack label.
df['attack_type'].value_counts()
Out[13]:
In [14]:
# Bar chart of how often each `attack_set` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="attack_set", ax=ax)
plt.xticks(rotation=45)
Out[14]:
In [15]:
# Row count per attack category.
df['attack_set'].value_counts()
Out[15]:
In [16]:
# Summary of the attack_set column.
df['attack_set'].describe()
Out[16]:
In [17]:
# Keep both label columns in their own frame, copied so it is independent of df.
label_columns = ['attack_type', 'attack_set']
y = df[label_columns].copy()
y.head()
Out[17]:
In [18]:
# Preview df while it still carries both label columns.
df.head()
Out[18]:
In [19]:
# Let's remove the labels from the dataset now.
# drop(..., errors='ignore') keeps this cell re-runnable: `del df[...]`
# raises KeyError on a second run because the column is already gone.
df = df.drop(columns=['attack_type', 'attack_set'], errors='ignore')
In [20]:
# Preview df again to check the label columns are gone.
df.head()
Out[20]:
In [21]:
# (rows, columns) of df after the label columns were removed.
df.shape
Out[21]:
The dataset contains 148,517 observations and 42 columns!
In [22]:
# Per-column dtype and non-null count.
df.info()
In [23]:
# Summary statistics for the numeric columns.
df.describe()
Out[23]:
In [44]:
# Histograms for a hand-picked subset of the numerical features.
hist_columns = [
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
]
num_df = df[hist_columns]
num_df.hist(figsize=(20, 15))
Out[44]:
In [24]:
# Distinct values of the protocol column.
df['protocol'].unique()
Out[24]:
In [25]:
# Bar chart of how often each `protocol` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="protocol", ax=ax)
plt.xticks(rotation=45)
Out[25]:
In [26]:
# Bar chart of how often each `service` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="service", ax=ax)
plt.xticks(rotation=45)
Out[26]:
In [27]:
# Bar chart of how often each `flag` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="flag", ax=ax)
plt.xticks(rotation=45)
Out[27]:
In [28]:
# Bar chart of how often each `land` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="land", ax=ax)
plt.xticks(rotation=45)
Out[28]:
In [29]:
# Bar chart of how often each `urgent` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="urgent", ax=ax)
plt.xticks(rotation=45)
Out[29]:
In [30]:
# Bar chart of how often each `hot` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="hot", ax=ax)
plt.xticks(rotation=45)
Out[30]:
In [31]:
# Bar chart of how often each `num_failed_logins` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="num_failed_logins", ax=ax)
plt.xticks(rotation=45)
Out[31]:
In [32]:
# Bar chart of how often each `logged_in` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="logged_in", ax=ax)
plt.xticks(rotation=45)
Out[32]:
In [33]:
# Bar chart of how often each `num_compromised` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="num_compromised", ax=ax)
plt.xticks(rotation=45)
Out[33]:
In [34]:
# Bar chart of how often each `root_shell` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="root_shell", ax=ax)
plt.xticks(rotation=45)
Out[34]:
In [35]:
# Bar chart of how often each `su_attempted` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="su_attempted", ax=ax)
plt.xticks(rotation=45)
Out[35]:
In [36]:
# Bar chart of how often each `num_root` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="num_root", ax=ax)
plt.xticks(rotation=45)
Out[36]:
In [37]:
# Bar chart of how often each `is_host_login` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="is_host_login", ax=ax)
plt.xticks(rotation=45)
Out[37]:
In [38]:
# Bar chart of how often each `is_guest_login` value occurs in df.
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(data=df, x="is_guest_login", ax=ax)
plt.xticks(rotation=45)
Out[38]:
In [39]:
# Re-check column dtypes before picking out the categorical columns.
df.info()
In [ ]:
# Select the columns this notebook treats as categorical and preview them.
categorical_columns = ['protocol', 'service', 'flag']
cat_df = df[categorical_columns]
cat_df.head()
In [45]:
## Convert Categorical Features to Numbers
In [48]:
from sklearn.preprocessing import OneHotEncoder
In [49]:
encoder = OneHotEncoder() # so ML classifiers will treat all values the same
In [ ]: