notebook.community

Edit and run



In [1]:

    
import pandas as pd



In [2]:

    
columns = ['duration', 'protocol', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
           'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
           'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
           'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
           'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
           'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
           'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type', 'other']



In [3]:

    
train_df = pd.read_csv('data/nsl_kdd/KDDTrain+.txt', header=None, names=columns)
test_df = pd.read_csv('data/nsl_kdd/KDDTest+.txt', header=None, names=columns)



In [4]:

    
train_df.head()









    Out[4]:







  
    
      
      duration
      protocol
      service
      flag
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      ...
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      dst_host_srv_rerror_rate
      attack_type
      other
    
  
  
    
      0
      0
      tcp
      ftp_data
      SF
      491
      0
      0
      0
      0
      0
      ...
      0.17
      0.03
      0.17
      0.00
      0.00
      0.00
      0.05
      0.00
      normal
      20
    
    
      1
      0
      udp
      other
      SF
      146
      0
      0
      0
      0
      0
      ...
      0.00
      0.60
      0.88
      0.00
      0.00
      0.00
      0.00
      0.00
      normal
      15
    
    
      2
      0
      tcp
      private
      S0
      0
      0
      0
      0
      0
      0
      ...
      0.10
      0.05
      0.00
      0.00
      1.00
      1.00
      0.00
      0.00
      neptune
      19
    
    
      3
      0
      tcp
      http
      SF
      232
      8153
      0
      0
      0
      0
      ...
      1.00
      0.00
      0.03
      0.04
      0.03
      0.01
      0.00
      0.01
      normal
      21
    
    
      4
      0
      tcp
      http
      SF
      199
      420
      0
      0
      0
      0
      ...
      1.00
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      normal
      21
    
  

5 rows × 43 columns



In [5]:

    
test_df.head()









    Out[5]:







  
    
      
      duration
      protocol
      service
      flag
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      ...
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      dst_host_srv_rerror_rate
      attack_type
      other
    
  
  
    
      0
      0
      tcp
      private
      REJ
      0
      0
      0
      0
      0
      0
      ...
      0.04
      0.06
      0.00
      0.00
      0.0
      0.0
      1.00
      1.00
      neptune
      21
    
    
      1
      0
      tcp
      private
      REJ
      0
      0
      0
      0
      0
      0
      ...
      0.00
      0.06
      0.00
      0.00
      0.0
      0.0
      1.00
      1.00
      neptune
      21
    
    
      2
      2
      tcp
      ftp_data
      SF
      12983
      0
      0
      0
      0
      0
      ...
      0.61
      0.04
      0.61
      0.02
      0.0
      0.0
      0.00
      0.00
      normal
      21
    
    
      3
      0
      icmp
      eco_i
      SF
      20
      0
      0
      0
      0
      0
      ...
      1.00
      0.00
      1.00
      0.28
      0.0
      0.0
      0.00
      0.00
      saint
      15
    
    
      4
      1
      tcp
      telnet
      RSTO
      0
      15
      0
      0
      0
      0
      ...
      0.31
      0.17
      0.03
      0.02
      0.0
      0.0
      0.83
      0.71
      mscan
      11
    
  

5 rows × 43 columns

Concatenate the datasets into one large dataframe (we will break them up into train-test splits later on)



In [6]:

    
df = pd.concat([train_df, test_df])
df.shape









    Out[6]:





(148517, 43)

Explore Attributes



In [7]:

    
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns# for pretty plots

The Label

The predictor label is attack_type



In [8]:

    
df.attack_type.unique()









    Out[8]:





array(['normal', 'neptune', 'warezclient', 'ipsweep', 'portsweep',
       'teardrop', 'nmap', 'satan', 'smurf', 'pod', 'back', 'guess_passwd',
       'ftp_write', 'multihop', 'rootkit', 'buffer_overflow', 'imap',
       'warezmaster', 'phf', 'land', 'loadmodule', 'spy', 'perl', 'saint',
       'mscan', 'apache2', 'snmpgetattack', 'processtable', 'httptunnel',
       'ps', 'snmpguess', 'mailbomb', 'named', 'sendmail', 'xterm', 'worm',
       'xlock', 'xsnoop', 'sqlattack', 'udpstorm'], dtype=object)



In [9]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="attack_type", data=df)
plt.xticks(rotation=45)









    Out[9]:





(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39]), <a list of 40 Text xticklabel objects>)

Clearly the data is unevenly distributed. Let's make a new variable called attack_set to which contains the superset the attack type belongs to



In [10]:

    
df['attack_set'] = df['attack_type']



In [11]:

    
# DOS
df.loc[df.attack_set == 'neptune', 'attack_set'] = 'dos'
df.loc[df.attack_set == 'back', 'attack_set'] = 'dos' 
df.loc[df.attack_set == 'land', 'attack_set'] = 'dos'
df.loc[df.attack_set == 'pod', 'attack_set'] = 'dos' 
df.loc[df.attack_set == 'smurf', 'attack_set'] = 'dos'
df.loc[df.attack_set == 'teardrop', 'attack_set'] = 'dos' 
df.loc[df.attack_set == 'mailbomb', 'attack_set'] = 'dos'
df.loc[df.attack_set == 'processtable', 'attack_set'] = 'dos' 
df.loc[df.attack_set == 'udpstorm', 'attack_set'] = 'dos'
df.loc[df.attack_set == 'apache2', 'attack_set'] = 'dos' 
df.loc[df.attack_set == 'worm', 'attack_set'] = 'dos'


# User-to-Root (U2R)
df.loc[df.attack_set == 'buffer_overflow', 'attack_set'] = 'u2r' 
df.loc[df.attack_set == 'loadmodule', 'attack_set'] = 'u2r'
df.loc[df.attack_set == 'perl', 'attack_set'] = 'u2r' 
df.loc[df.attack_set == 'rootkit', 'attack_set'] = 'u2r'
df.loc[df.attack_set == 'sqlattack', 'attack_set'] = 'u2r' 
df.loc[df.attack_set == 'xterm', 'attack_set'] = 'u2r'
df.loc[df.attack_set == 'ps', 'attack_set'] = 'u2r'

# Remote-to-Local (R2L)
df.loc[df.attack_set == 'ftp_write', 'attack_set'] = 'r2l'
df.loc[df.attack_set == 'guess_passwd', 'attack_set'] = 'r2l' 
df.loc[df.attack_set == 'imap', 'attack_set'] = 'r2l'
df.loc[df.attack_set == 'multihop', 'attack_set'] = 'r2l' 
df.loc[df.attack_set == 'phf', 'attack_set'] = 'r2l'
df.loc[df.attack_set == 'spy', 'attack_set'] = 'r2l' 
df.loc[df.attack_set == 'warezclient', 'attack_set'] = 'r2l'
df.loc[df.attack_set == 'warezmaster', 'attack_set'] = 'r2l' 
df.loc[df.attack_set == 'xlock', 'attack_set'] = 'r2l'
df.loc[df.attack_set == 'xsnoop', 'attack_set'] = 'r2l' 
df.loc[df.attack_set == 'snmpgetattack', 'attack_set'] = 'r2l'
df.loc[df.attack_set == 'httptunnel', 'attack_set'] = 'r2l' 
df.loc[df.attack_set == 'snmpguess', 'attack_set'] = 'r2l'
df.loc[df.attack_set == 'sendmail', 'attack_set'] = 'r2l' 
df.loc[df.attack_set == 'named', 'attack_set'] = 'r2l'

# Probe attacks
df.loc[df.attack_set == 'satan', 'attack_set'] = 'probe'
df.loc[df.attack_set == 'ipsweep', 'attack_set'] = 'probe'
df.loc[df.attack_set == 'nmap', 'attack_set'] = 'probe'
df.loc[df.attack_set == 'portsweep', 'attack_set'] = 'probe'
df.loc[df.attack_set == 'saint', 'attack_set'] = 'probe'
df.loc[df.attack_set == 'mscan', 'attack_set'] = 'probe'



In [12]:

    
df.attack_set.unique()









    Out[12]:





array(['normal', 'dos', 'r2l', 'probe', 'u2r'], dtype=object)



In [13]:

    
df.attack_type.value_counts()









    Out[13]:





normal             77054
neptune            45871
satan               4368
ipsweep             3740
smurf               3311
portsweep           3088
nmap                1566
back                1315
guess_passwd        1284
mscan                996
warezmaster          964
teardrop             904
warezclient          890
apache2              737
processtable         685
snmpguess            331
saint                319
mailbomb             293
pod                  242
snmpgetattack        178
httptunnel           133
buffer_overflow       50
land                  25
multihop              25
rootkit               23
named                 17
ps                    15
sendmail              14
xterm                 13
imap                  12
ftp_write             11
loadmodule            11
xlock                  9
phf                    6
perl                   5
xsnoop                 4
worm                   2
spy                    2
sqlattack              2
udpstorm               2
Name: attack_type, dtype: int64



In [14]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="attack_set", data=df)
plt.xticks(rotation=45)









    Out[14]:





(array([0, 1, 2, 3, 4]), <a list of 5 Text xticklabel objects>)



In [15]:

    
df.attack_set.value_counts()









    Out[15]:





normal    77054
dos       53387
probe     14077
r2l        3880
u2r         119
Name: attack_set, dtype: int64



In [16]:

    
df.attack_set.describe()









    Out[16]:





count     148517
unique         5
top       normal
freq       77054
Name: attack_set, dtype: object



In [17]:

    
y = df[['attack_type', 'attack_set']].copy()
y.head()









    Out[17]:







  
    
      
      attack_type
      attack_set
    
  
  
    
      0
      normal
      normal
    
    
      1
      normal
      normal
    
    
      2
      neptune
      dos
    
    
      3
      normal
      normal
    
    
      4
      normal
      normal



In [18]:

    
df.head()









    Out[18]:







  
    
      
      duration
      protocol
      service
      flag
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      ...
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      dst_host_srv_rerror_rate
      attack_type
      other
      attack_set
    
  
  
    
      0
      0
      tcp
      ftp_data
      SF
      491
      0
      0
      0
      0
      0
      ...
      0.03
      0.17
      0.00
      0.00
      0.00
      0.05
      0.00
      normal
      20
      normal
    
    
      1
      0
      udp
      other
      SF
      146
      0
      0
      0
      0
      0
      ...
      0.60
      0.88
      0.00
      0.00
      0.00
      0.00
      0.00
      normal
      15
      normal
    
    
      2
      0
      tcp
      private
      S0
      0
      0
      0
      0
      0
      0
      ...
      0.05
      0.00
      0.00
      1.00
      1.00
      0.00
      0.00
      neptune
      19
      dos
    
    
      3
      0
      tcp
      http
      SF
      232
      8153
      0
      0
      0
      0
      ...
      0.00
      0.03
      0.04
      0.03
      0.01
      0.00
      0.01
      normal
      21
      normal
    
    
      4
      0
      tcp
      http
      SF
      199
      420
      0
      0
      0
      0
      ...
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      normal
      21
      normal
    
  

5 rows × 44 columns



In [19]:

    
# Let's remove the labels from the dataset now
del df['attack_type']
del df['attack_set']



In [20]:

    
df.head()









    Out[20]:







  
    
      
      duration
      protocol
      service
      flag
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      ...
      dst_host_srv_count
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      dst_host_srv_rerror_rate
      other
    
  
  
    
      0
      0
      tcp
      ftp_data
      SF
      491
      0
      0
      0
      0
      0
      ...
      25
      0.17
      0.03
      0.17
      0.00
      0.00
      0.00
      0.05
      0.00
      20
    
    
      1
      0
      udp
      other
      SF
      146
      0
      0
      0
      0
      0
      ...
      1
      0.00
      0.60
      0.88
      0.00
      0.00
      0.00
      0.00
      0.00
      15
    
    
      2
      0
      tcp
      private
      S0
      0
      0
      0
      0
      0
      0
      ...
      26
      0.10
      0.05
      0.00
      0.00
      1.00
      1.00
      0.00
      0.00
      19
    
    
      3
      0
      tcp
      http
      SF
      232
      8153
      0
      0
      0
      0
      ...
      255
      1.00
      0.00
      0.03
      0.04
      0.03
      0.01
      0.00
      0.01
      21
    
    
      4
      0
      tcp
      http
      SF
      199
      420
      0
      0
      0
      0
      ...
      255
      1.00
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      21
    
  

5 rows × 42 columns

The Features



In [21]:

    
df.shape









    Out[21]:





(148517, 42)

The datset contains 148,517 observations and 42 labels!



In [22]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 148517 entries, 0 to 22543
Data columns (total 42 columns):
duration                       148517 non-null int64
protocol                       148517 non-null object
service                        148517 non-null object
flag                           148517 non-null object
src_bytes                      148517 non-null int64
dst_bytes                      148517 non-null int64
land                           148517 non-null int64
wrong_fragment                 148517 non-null int64
urgent                         148517 non-null int64
hot                            148517 non-null int64
num_failed_logins              148517 non-null int64
logged_in                      148517 non-null int64
num_compromised                148517 non-null int64
root_shell                     148517 non-null int64
su_attempted                   148517 non-null int64
num_root                       148517 non-null int64
num_file_creations             148517 non-null int64
num_shells                     148517 non-null int64
num_access_files               148517 non-null int64
num_outbound_cmds              148517 non-null int64
is_host_login                  148517 non-null int64
is_guest_login                 148517 non-null int64
count                          148517 non-null int64
srv_count                      148517 non-null int64
serror_rate                    148517 non-null float64
srv_serror_rate                148517 non-null float64
rerror_rate                    148517 non-null float64
srv_rerror_rate                148517 non-null float64
same_srv_rate                  148517 non-null float64
diff_srv_rate                  148517 non-null float64
srv_diff_host_rate             148517 non-null float64
dst_host_count                 148517 non-null int64
dst_host_srv_count             148517 non-null int64
dst_host_same_srv_rate         148517 non-null float64
dst_host_diff_srv_rate         148517 non-null float64
dst_host_same_src_port_rate    148517 non-null float64
dst_host_srv_diff_host_rate    148517 non-null float64
dst_host_serror_rate           148517 non-null float64
dst_host_srv_serror_rate       148517 non-null float64
dst_host_rerror_rate           148517 non-null float64
dst_host_srv_rerror_rate       148517 non-null float64
other                          148517 non-null int64
dtypes: float64(15), int64(24), object(3)
memory usage: 53.7+ MB



In [23]:

    
df.describe()









    Out[23]:







  
    
      
      duration
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      num_failed_logins
      logged_in
      num_compromised
      ...
      dst_host_srv_count
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      dst_host_srv_rerror_rate
      other
    
  
  
    
      count
      148517.000000
      1.485170e+05
      1.485170e+05
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      ...
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      148517.000000
      148517.000000
    
    
      mean
      276.779305
      4.022795e+04
      1.708885e+04
      0.000215
      0.020523
      0.000202
      0.189379
      0.004323
      0.402789
      0.255062
      ...
      119.462661
      0.534521
      0.084103
      0.145932
      0.030584
      0.256122
      0.251304
      0.136220
      0.136397
      19.278480
    
    
      std
      2460.683131
      5.409612e+06
      3.703525e+06
      0.014677
      0.240069
      0.019417
      2.013160
      0.072248
      0.490461
      22.231375
      ...
      111.232318
      0.448061
      0.194102
      0.308638
      0.108975
      0.428500
      0.429719
      0.322741
      0.335282
      2.739757
    
    
      min
      0.000000
      0.000000e+00
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.000000e+00
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      11.000000
      0.050000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      18.000000
    
    
      50%
      0.000000
      4.400000e+01
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      72.000000
      0.600000
      0.020000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      20.000000
    
    
      75%
      0.000000
      2.780000e+02
      5.710000e+02
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
      0.000000
      ...
      255.000000
      1.000000
      0.070000
      0.050000
      0.010000
      0.600000
      0.500000
      0.000000
      0.000000
      21.000000
    
    
      max
      57715.000000
      1.379964e+09
      1.309937e+09
      1.000000
      3.000000
      3.000000
      101.000000
      5.000000
      1.000000
      7479.000000
      ...
      255.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      21.000000
    
  

8 rows × 39 columns

Numerical Features



In [44]:

    
# This is a subset of the numerical features
num_df = df[['serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count']]
num_df.hist(figsize=(20,15))









    Out[44]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11bce9860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a3454a8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a8dcf60>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11a93d048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a982fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a9b24a8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11aed6940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11aee64a8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x127db4710>]], dtype=object)

Categorical Features



In [24]:

    
df.protocol.unique()









    Out[24]:





array(['tcp', 'udp', 'icmp'], dtype=object)



In [25]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="protocol", data=df)
plt.xticks(rotation=45)









    Out[25]:





(array([0, 1, 2]), <a list of 3 Text xticklabel objects>)



In [26]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="service", data=df)
plt.xticks(rotation=45)









    Out[26]:





(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69]), <a list of 70 Text xticklabel objects>)



In [27]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="flag", data=df)
plt.xticks(rotation=45)









    Out[27]:





(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 <a list of 11 Text xticklabel objects>)



In [28]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="land", data=df)
plt.xticks(rotation=45)









    Out[28]:





(array([0, 1]), <a list of 2 Text xticklabel objects>)



In [29]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="urgent", data=df)
plt.xticks(rotation=45)









    Out[29]:





(array([0, 1, 2, 3]), <a list of 4 Text xticklabel objects>)



In [30]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="hot", data=df)
plt.xticks(rotation=45)









    Out[30]:





(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
 <a list of 29 Text xticklabel objects>)



In [31]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="num_failed_logins", data=df)
plt.xticks(rotation=45)









    Out[31]:





(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)



In [32]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="logged_in", data=df)
plt.xticks(rotation=45)









    Out[32]:





(array([0, 1]), <a list of 2 Text xticklabel objects>)



In [33]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="num_compromised", data=df)
plt.xticks(rotation=45)









    Out[33]:





(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95]),
 <a list of 96 Text xticklabel objects>)



In [34]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="root_shell", data=df)
plt.xticks(rotation=45)









    Out[34]:





(array([0, 1]), <a list of 2 Text xticklabel objects>)



In [35]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="su_attempted", data=df)
plt.xticks(rotation=45)









    Out[35]:





(array([0, 1, 2]), <a list of 3 Text xticklabel objects>)



In [36]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="num_root", data=df)
plt.xticks(rotation=45)









    Out[36]:





(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90]), <a list of 91 Text xticklabel objects>)



In [37]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="is_host_login", data=df)
plt.xticks(rotation=45)









    Out[37]:





(array([0, 1]), <a list of 2 Text xticklabel objects>)



In [38]:

    
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="is_guest_login", data=df)
plt.xticks(rotation=45)









    Out[38]:





(array([0, 1]), <a list of 2 Text xticklabel objects>)



In [39]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 148517 entries, 0 to 22543
Data columns (total 42 columns):
duration                       148517 non-null int64
protocol                       148517 non-null object
service                        148517 non-null object
flag                           148517 non-null object
src_bytes                      148517 non-null int64
dst_bytes                      148517 non-null int64
land                           148517 non-null int64
wrong_fragment                 148517 non-null int64
urgent                         148517 non-null int64
hot                            148517 non-null int64
num_failed_logins              148517 non-null int64
logged_in                      148517 non-null int64
num_compromised                148517 non-null int64
root_shell                     148517 non-null int64
su_attempted                   148517 non-null int64
num_root                       148517 non-null int64
num_file_creations             148517 non-null int64
num_shells                     148517 non-null int64
num_access_files               148517 non-null int64
num_outbound_cmds              148517 non-null int64
is_host_login                  148517 non-null int64
is_guest_login                 148517 non-null int64
count                          148517 non-null int64
srv_count                      148517 non-null int64
serror_rate                    148517 non-null float64
srv_serror_rate                148517 non-null float64
rerror_rate                    148517 non-null float64
srv_rerror_rate                148517 non-null float64
same_srv_rate                  148517 non-null float64
diff_srv_rate                  148517 non-null float64
srv_diff_host_rate             148517 non-null float64
dst_host_count                 148517 non-null int64
dst_host_srv_count             148517 non-null int64
dst_host_same_srv_rate         148517 non-null float64
dst_host_diff_srv_rate         148517 non-null float64
dst_host_same_src_port_rate    148517 non-null float64
dst_host_srv_diff_host_rate    148517 non-null float64
dst_host_serror_rate           148517 non-null float64
dst_host_srv_serror_rate       148517 non-null float64
dst_host_rerror_rate           148517 non-null float64
dst_host_srv_rerror_rate       148517 non-null float64
other                          148517 non-null int64
dtypes: float64(15), int64(24), object(3)
memory usage: 53.7+ MB



In [ ]:

    
cat_df = df[['protocol', 'service', 'flag']]
cat_df.head()



In [45]:

    
## Convert Categorical Features to Numbers



In [48]:

    
from sklearn.preprocessing import OneHotEncoder



In [49]:

    
encoder = OneHotEncoder()  # so ML Classifiers willt treat all values the same



In [ ]:

	protocol	service	flag	src_bytes	dst_bytes	...	dst_host_same_srv_rate	dst_host_diff_srv_rate	dst_host_same_src_port_rate	dst_host_srv_diff_host_rate	dst_host_serror_rate	dst_host_srv_serror_rate	dst_host_rerror_rate	dst_host_srv_rerror_rate	attack_type	other
0	tcp	ftp_data	SF	491	0	...	0.17	0.03	0.17	0.00	0.00	0.00	0.05	0.00	normal	20
1	udp	other	SF	146	0	...	0.00	0.60	0.88	0.00	0.00	0.00	0.00	0.00	normal	15
2	tcp	private	S0	0	0	...	0.10	0.05	0.00	0.00	1.00	1.00	0.00	0.00	neptune	19
3	tcp	http	SF	232	8153	...	1.00	0.00	0.03	0.04	0.03	0.01	0.00	0.01	normal	21
4	tcp	http	SF	199	420	...	1.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	normal	21

	duration	protocol	service	flag	src_bytes	dst_bytes	...	dst_host_same_srv_rate	dst_host_diff_srv_rate	dst_host_same_src_port_rate	dst_host_srv_diff_host_rate	dst_host_rerror_rate	dst_host_srv_rerror_rate	attack_type	other
0	0	tcp	private	REJ	0	0	...	0.04	0.06	0.00	0.00	1.00	1.00	neptune	21
1	0	tcp	private	REJ	0	0	...	0.00	0.06	0.00	0.00	1.00	1.00	neptune	21
2	2	tcp	ftp_data	SF	12983	0	...	0.61	0.04	0.61	0.02	0.00	0.00	normal	21
3	0	icmp	eco_i	SF	20	0	...	1.00	0.00	1.00	0.28	0.00	0.00	saint	15
4	1	tcp	telnet	RSTO	0	15	...	0.31	0.17	0.03	0.02	0.83	0.71	mscan	11

	duration	src_bytes	dst_bytes	land	wrong_fragment	urgent	hot	num_failed_logins	logged_in	num_compromised	...	dst_host_srv_count	dst_host_same_srv_rate	dst_host_diff_srv_rate	dst_host_same_src_port_rate	dst_host_srv_diff_host_rate	dst_host_serror_rate	dst_host_srv_serror_rate	dst_host_rerror_rate	dst_host_srv_rerror_rate	other
count	148517.000000	1.485170e+05	1.485170e+05	148517.000000	148517.000000	148517.000000	148517.000000	148517.000000	148517.000000	148517.000000	...	148517.000000	148517.000000	148517.000000	148517.000000	148517.000000	148517.000000	148517.000000	148517.000000	148517.000000	148517.000000
mean	276.779305	4.022795e+04	1.708885e+04	0.000215	0.020523	0.000202	0.189379	0.004323	0.402789	0.255062	...	119.462661	0.534521	0.084103	0.145932	0.030584	0.256122	0.251304	0.136220	0.136397	19.278480
std	2460.683131	5.409612e+06	3.703525e+06	0.014677	0.240069	0.019417	2.013160	0.072248	0.490461	22.231375	...	111.232318	0.448061	0.194102	0.308638	0.108975	0.428500	0.429719	0.322741	0.335282	2.739757
min	0.000000	0.000000e+00	0.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000e+00	0.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	11.000000	0.050000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	18.000000
50%	0.000000	4.400000e+01	0.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	72.000000	0.600000	0.020000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	20.000000
75%	0.000000	2.780000e+02	5.710000e+02	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	...	255.000000	1.000000	0.070000	0.050000	0.010000	0.600000	0.500000	0.000000	0.000000	21.000000
max	57715.000000	1.379964e+09	1.309937e+09	1.000000	3.000000	3.000000	101.000000	5.000000	1.000000	7479.000000	...	255.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	21.000000