Read Data Sample



In [2]:

    
import pandas as pd
import numpy as np
pd.set_option("display.max_rows",15)
%matplotlib inline



In [3]:

    
class dataset:
    col_names = ["duration","protocol_type","service","flag","src_bytes",
        "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
        "logged_in","num_compromised","root_shell","su_attempted","num_root",
        "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
        "is_host_login","is_guest_login","count","srv_count","serror_rate",
        "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
        "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
        "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
        "dst_host_rerror_rate","dst_host_srv_rerror_rate","label", "difficulty_level"]

    kdd_train = pd.read_csv("dataset/KDDTrain+_20Percent.txt",names = col_names,)
    kdd_test = pd.read_csv("dataset/KDDTest-21.txt",names = col_names,)

    kdd_diff_level_train = kdd_train["difficulty_level"].copy()
    kdd_diff_level_test = kdd_test["difficulty_level"].copy()
    
    kdd_train = kdd_train.drop("difficulty_level", axis = 1)
    kdd_test = kdd_test.drop("difficulty_level", axis = 1)
    
    kdd_train.to_csv("dataset/KDDTrain+_20Percent.csv")
    kdd_test.to_csv("dataset/KDDTest-21.csv")



In [4]:

    
category_variables = ["protocol_type","service","flag"]
for cv in category_variables:
    dataset.kdd_train[cv] = dataset.kdd_train[cv].astype("category")
    dataset.kdd_test[cv] = dataset.kdd_test[cv].astype("category", 
                                                       categories = dataset.kdd_train[cv].cat.categories)
    print("Length of Categories for {} are {}".format(cv , len(dataset.kdd_train[cv].cat.categories)))
    print("Categories for {} are {} \n".format(cv ,dataset.kdd_train[cv].cat.categories))









    



Length of Categories for protocol_type are 3
Categories for protocol_type are Index(['icmp', 'tcp', 'udp'], dtype='object') 

Length of Categories for service are 66
Categories for service are Index(['IRC', 'X11', 'Z39_50', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf',
       'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i', 'ecr_i',
       'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'hostnames',
       'http', 'http_443', 'http_8001', 'imap4', 'iso_tsap', 'klogin',
       'kshell', 'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm',
       'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u',
       'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private', 'red_i',
       'remote_job', 'rje', 'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc',
       'supdup', 'systat', 'telnet', 'tim_i', 'time', 'urh_i', 'urp_i', 'uucp',
       'uucp_path', 'vmnet', 'whois'],
      dtype='object') 

Length of Categories for flag are 11
Categories for flag are Index(['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF',
       'SH'],
      dtype='object')



In [4]:

    
dataset.kdd_train









    Out[4]:






  
    
      
      duration
      protocol_type
      service
      flag
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      ...
      dst_host_srv_count
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      dst_host_srv_rerror_rate
      label
    
  
  
    
      0
      0
      tcp
      ftp_data
      SF
      491
      0
      0
      0
      0
      0
      ...
      25
      0.17
      0.03
      0.17
      0.00
      0.00
      0.00
      0.05
      0.00
      normal
    
    
      1
      0
      udp
      other
      SF
      146
      0
      0
      0
      0
      0
      ...
      1
      0.00
      0.60
      0.88
      0.00
      0.00
      0.00
      0.00
      0.00
      normal
    
    
      2
      0
      tcp
      private
      S0
      0
      0
      0
      0
      0
      0
      ...
      26
      0.10
      0.05
      0.00
      0.00
      1.00
      1.00
      0.00
      0.00
      neptune
    
    
      3
      0
      tcp
      http
      SF
      232
      8153
      0
      0
      0
      0
      ...
      255
      1.00
      0.00
      0.03
      0.04
      0.03
      0.01
      0.00
      0.01
      normal
    
    
      4
      0
      tcp
      http
      SF
      199
      420
      0
      0
      0
      0
      ...
      255
      1.00
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      normal
    
    
      5
      0
      tcp
      private
      REJ
      0
      0
      0
      0
      0
      0
      ...
      19
      0.07
      0.07
      0.00
      0.00
      0.00
      0.00
      1.00
      1.00
      neptune
    
    
      6
      0
      tcp
      private
      S0
      0
      0
      0
      0
      0
      0
      ...
      9
      0.04
      0.05
      0.00
      0.00
      1.00
      1.00
      0.00
      0.00
      neptune
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      25185
      1
      tcp
      smtp
      SF
      2896
      333
      0
      0
      0
      0
      ...
      11
      0.92
      0.17
      0.08
      0.00
      0.00
      0.00
      0.00
      0.00
      normal
    
    
      25186
      0
      tcp
      http
      S1
      339
      14600
      0
      0
      0
      0
      ...
      255
      1.00
      0.00
      0.01
      0.01
      0.01
      0.00
      0.01
      0.00
      normal
    
    
      25187
      0
      tcp
      exec
      RSTO
      0
      0
      0
      0
      0
      0
      ...
      7
      0.03
      0.06
      0.00
      0.00
      0.00
      0.00
      1.00
      1.00
      neptune
    
    
      25188
      0
      tcp
      ftp_data
      SF
      334
      0
      0
      0
      0
      0
      ...
      39
      1.00
      0.00
      1.00
      0.18
      0.00
      0.00
      0.00
      0.00
      warezclient
    
    
      25189
      0
      tcp
      private
      REJ
      0
      0
      0
      0
      0
      0
      ...
      13
      0.05
      0.07
      0.00
      0.00
      0.00
      0.00
      1.00
      1.00
      neptune
    
    
      25190
      0
      tcp
      nnsp
      S0
      0
      0
      0
      0
      0
      0
      ...
      20
      0.08
      0.06
      0.00
      0.00
      1.00
      1.00
      0.00
      0.00
      neptune
    
    
      25191
      0
      tcp
      finger
      S0
      0
      0
      0
      0
      0
      0
      ...
      49
      0.19
      0.03
      0.01
      0.00
      1.00
      1.00
      0.00
      0.00
      neptune
    
  

25192 rows × 42 columns



In [5]:

    
dataset.kdd_test









    Out[5]:






  
    
      
      duration
      protocol_type
      service
      flag
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      ...
      dst_host_srv_count
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      dst_host_srv_rerror_rate
      label
    
  
  
    
      0
      13
      tcp
      telnet
      SF
      118
      2425
      0
      0
      0
      0
      ...
      10
      0.38
      0.12
      0.04
      0.00
      0.00
      0.00
      0.12
      0.30
      guess_passwd
    
    
      1
      0
      udp
      private
      SF
      44
      0
      0
      0
      0
      0
      ...
      254
      1.00
      0.01
      0.01
      0.00
      0.00
      0.00
      0.00
      0.00
      snmpguess
    
    
      2
      0
      tcp
      telnet
      S3
      0
      44
      0
      0
      0
      0
      ...
      79
      0.31
      0.61
      0.00
      0.00
      0.21
      0.68
      0.60
      0.00
      processtable
    
    
      3
      0
      udp
      private
      SF
      53
      55
      0
      0
      0
      0
      ...
      255
      1.00
      0.00
      0.87
      0.00
      0.00
      0.00
      0.00
      0.00
      normal
    
    
      4
      0
      tcp
      private
      SH
      0
      0
      0
      0
      0
      0
      ...
      1
      0.06
      1.00
      1.00
      0.00
      1.00
      1.00
      0.00
      0.00
      nmap
    
    
      5
      0
      tcp
      http
      SF
      54540
      8314
      0
      0
      0
      2
      ...
      229
      0.90
      0.01
      0.00
      0.00
      0.00
      0.00
      0.01
      0.00
      back
    
    
      6
      0
      tcp
      imap4
      REJ
      0
      0
      0
      0
      0
      0
      ...
      9
      0.04
      0.07
      0.00
      0.00
      0.00
      0.00
      1.00
      1.00
      neptune
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      11843
      0
      tcp
      other
      REJ
      0
      0
      0
      0
      0
      0
      ...
      1
      0.00
      0.90
      0.00
      0.00
      0.15
      0.00
      0.85
      1.00
      satan
    
    
      11844
      5
      tcp
      pop_3
      SF
      28
      93
      0
      0
      0
      0
      ...
      25
      1.00
      0.00
      0.04
      0.08
      0.00
      0.00
      0.00
      0.00
      guess_passwd
    
    
      11845
      0
      udp
      domain_u
      SF
      43
      43
      0
      0
      0
      0
      ...
      255
      1.00
      0.00
      0.01
      0.00
      0.00
      0.00
      0.00
      0.00
      normal
    
    
      11846
      0
      tcp
      http
      SF
      336
      285
      0
      0
      0
      0
      ...
      234
      0.92
      0.02
      0.00
      0.00
      0.00
      0.00
      0.05
      0.00
      normal
    
    
      11847
      1
      tcp
      telnet
      RSTO
      0
      15
      0
      0
      0
      0
      ...
      96
      0.37
      0.03
      0.01
      0.02
      0.05
      0.08
      0.85
      0.58
      mscan
    
    
      11848
      0
      tcp
      sunrpc
      REJ
      0
      0
      0
      0
      0
      0
      ...
      52
      0.19
      0.03
      0.01
      0.04
      0.00
      0.00
      0.88
      1.00
      mscan
    
    
      11849
      0
      udp
      private
      SF
      41
      0
      0
      0
      0
      0
      ...
      255
      1.00
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      0.00
      snmpguess
    
  

11850 rows × 42 columns



In [6]:

    
dataset.kdd_train.describe()









    Out[6]:






  
    
      
      duration
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      num_failed_logins
      logged_in
      num_compromised
      ...
      dst_host_count
      dst_host_srv_count
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      dst_host_srv_rerror_rate
    
  
  
    
      count
      25192.000000
      2.519200e+04
      2.519200e+04
      25192.000000
      25192.000000
      25192.00000
      25192.000000
      25192.000000
      25192.000000
      25192.000000
      ...
      25192.000000
      25192.000000
      25192.000000
      25192.000000
      25192.000000
      25192.000000
      25192.000000
      25192.000000
      25192.000000
      25192.000000
    
    
      mean
      305.054104
      2.433063e+04
      3.491847e+03
      0.000079
      0.023738
      0.00004
      0.198039
      0.001191
      0.394768
      0.227850
      ...
      182.532074
      115.063036
      0.519791
      0.082539
      0.147453
      0.031844
      0.285800
      0.279846
      0.117800
      0.118769
    
    
      std
      2686.555640
      2.410805e+06
      8.883072e+04
      0.008910
      0.260221
      0.00630
      2.154202
      0.045418
      0.488811
      10.417352
      ...
      98.993895
      110.646850
      0.448944
      0.187191
      0.308367
      0.110575
      0.445316
      0.446075
      0.305869
      0.317333
    
    
      min
      0.000000
      0.000000e+00
      0.000000e+00
      0.000000
      0.000000
      0.00000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.000000e+00
      0.000000e+00
      0.000000
      0.000000
      0.00000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      84.000000
      10.000000
      0.050000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.000000
      4.400000e+01
      0.000000e+00
      0.000000
      0.000000
      0.00000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      255.000000
      61.000000
      0.510000
      0.030000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      75%
      0.000000
      2.790000e+02
      5.302500e+02
      0.000000
      0.000000
      0.00000
      0.000000
      0.000000
      1.000000
      0.000000
      ...
      255.000000
      255.000000
      1.000000
      0.070000
      0.060000
      0.020000
      1.000000
      1.000000
      0.000000
      0.000000
    
    
      max
      42862.000000
      3.817091e+08
      5.151385e+06
      1.000000
      3.000000
      1.00000
      77.000000
      4.000000
      1.000000
      884.000000
      ...
      255.000000
      255.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
    
  

8 rows × 38 columns



In [7]:

    
dataset.kdd_test.describe()









    Out[7]:






  
    
      
      duration
      src_bytes
      dst_bytes
      land
      wrong_fragment
      urgent
      hot
      num_failed_logins
      logged_in
      num_compromised
      ...
      dst_host_count
      dst_host_srv_count
      dst_host_same_srv_rate
      dst_host_diff_srv_rate
      dst_host_same_src_port_rate
      dst_host_srv_diff_host_rate
      dst_host_serror_rate
      dst_host_srv_serror_rate
      dst_host_rerror_rate
      dst_host_srv_rerror_rate
    
  
  
    
      count
      11850.000000
      1.185000e+04
      1.185000e+04
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      ...
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      11850.000000
      11850.000000
    
    
      mean
      415.439831
      1.945644e+04
      1.228105e+03
      0.000591
      0.016034
      0.001350
      0.190211
      0.041181
      0.253080
      0.227679
      ...
      213.945401
      117.541603
      0.539597
      0.148986
      0.206493
      0.024465
      0.104181
      0.106697
      0.263952
      0.252209
    
    
      std
      1919.441623
      6.519865e+05
      2.389603e+04
      0.024299
      0.196379
      0.050299
      1.222392
      0.205401
      0.434795
      10.025774
      ...
      82.203728
      106.563438
      0.423370
      0.288758
      0.379193
      0.114234
      0.265086
      0.278549
      0.377337
      0.403039
    
    
      min
      0.000000
      0.000000e+00
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      1.000000
      1.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.000000e+00
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      255.000000
      12.000000
      0.070000
      0.010000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.000000
      2.800000e+01
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      255.000000
      83.000000
      0.530000
      0.020000
      0.000000
      0.000000
      0.000000
      0.000000
      0.020000
      0.000000
    
    
      75%
      1.000000
      1.600000e+02
      9.300000e+01
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
      0.000000
      ...
      255.000000
      250.000000
      1.000000
      0.070000
      0.090000
      0.000000
      0.010000
      0.000000
      0.560000
      0.520000
    
    
      max
      57715.000000
      6.282565e+07
      1.288652e+06
      1.000000
      3.000000
      3.000000
      101.000000
      4.000000
      1.000000
      796.000000
      ...
      255.000000
      255.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
    
  

8 rows × 38 columns



In [8]:

    
print("Column - Label")
print("Unique values: \n{}".format(dataset.kdd_train.label))
print("\nStatistical properties: \n{}".format(dataset.kdd_train.label.describe()))









    



Column - Label
Unique values: 
0             normal
1             normal
2            neptune
3             normal
4             normal
5            neptune
6            neptune
            ...     
25185         normal
25186         normal
25187        neptune
25188    warezclient
25189        neptune
25190        neptune
25191        neptune
Name: label, dtype: object

Statistical properties: 
count      25192
unique        22
top       normal
freq       13449
Name: label, dtype: object



In [9]:

    
attack_types = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}

is_attack = {
    "DoS":"Attack",
    "R2L":"Attack",
    "U2R":"Attack",
    "Probe":"Attack",
    "normal":"Normal"
}



In [10]:

    
dataset.kdd_train["type"] = dataset.kdd_train.label.map(lambda x: attack_types[x])
dataset.kdd_train["is"] = dataset.kdd_train.type.map(lambda x: is_attack[x])

dataset.kdd_test["type"] = dataset.kdd_test.label.map(lambda x: attack_types[x])
dataset.kdd_test["is"] = dataset.kdd_test.type.map(lambda x: is_attack[x])



In [11]:

    
kdd_attack_type_group = dataset.kdd_train.groupby("type")
kdd_is_attack_group = dataset.kdd_train.groupby("is")



In [12]:

    
kdd_attack_type_group.type.count()









    Out[12]:





type
DoS        9234
Probe      2289
R2L         209
U2R          11
normal    13449
Name: type, dtype: int64



In [13]:

    
kdd_is_attack_group["is"].count()









    Out[13]:





is
Attack    11743
Normal    13449
Name: is, dtype: int64



In [14]:

    
df = dataset.kdd_train.set_index("is")
df.loc["Attack"].label.unique()









    Out[14]:





array(['neptune', 'warezclient', 'ipsweep', 'portsweep', 'teardrop',
       'nmap', 'satan', 'smurf', 'pod', 'back', 'guess_passwd',
       'ftp_write', 'multihop', 'rootkit', 'buffer_overflow', 'imap',
       'warezmaster', 'phf', 'land', 'loadmodule', 'spy'], dtype=object)



In [15]:

    
df.loc["Normal"].label.unique()









    Out[15]:





array(['normal'], dtype=object)



In [16]:

    
kdd_is_attack_group.hist(figsize=[25,22])









    Out[16]:





is
Attack    [[Axes(0.125,0.794205;0.103333x0.0857955), Axe...
Normal    [[Axes(0.125,0.794205;0.103333x0.0857955), Axe...
dtype: object



In [17]:

    
kdd_attack_type_group.hist(figsize=[25,22])









    Out[17]:





type
DoS       [[Axes(0.125,0.794205;0.103333x0.0857955), Axe...
Probe     [[Axes(0.125,0.794205;0.103333x0.0857955), Axe...
R2L       [[Axes(0.125,0.794205;0.103333x0.0857955), Axe...
U2R       [[Axes(0.125,0.794205;0.103333x0.0857955), Axe...
normal    [[Axes(0.125,0.794205;0.103333x0.0857955), Axe...
dtype: object



In [18]:

    
gb = dataset.kdd_diff_level_train.groupby(dataset.kdd_diff_level_train)
(gb.count() / dataset.kdd_diff_level_train.count())*100









    Out[18]:





difficulty_level
0      0.047634
1      0.067482
2      0.039695
3      0.055573
4      0.055573
5      0.055573
6      0.107177
        ...    
15     3.183550
16     1.861702
17     2.330105
18    16.441728
19     8.125595
20    15.330264
21    49.599079
Name: difficulty_level, dtype: float64



In [19]:

    
gb = dataset.kdd_diff_level_test.groupby(dataset.kdd_diff_level_test)
(gb.count() / dataset.kdd_diff_level_test.count())*100









    Out[19]:





difficulty_level
0      1.037975
1      0.734177
2      0.464135
3      0.978903
4      0.852321
5      0.869198
6      1.324895
        ...    
14     6.210970
15     9.924051
16     5.746835
17     9.856540
18    25.037975
19     7.510549
20    11.333333
Name: difficulty_level, dtype: float64



In [20]:

    
dummy_variables_2labels = [*category_variables, "is"]
dummy_variables_5labels = [*category_variables, "type"]
class preprocessing:
    kdd_train_2labels = pd.get_dummies(dataset.kdd_train, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_train_5labels = pd.get_dummies(dataset.kdd_train, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)

    kdd_test_2labels = pd.get_dummies(dataset.kdd_test, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_test_5labels = pd.get_dummies(dataset.kdd_test, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)


    kdd_train_2labels_y = dataset.kdd_train["is"].copy() # For SVM
    kdd_train_5labels_y = dataset.kdd_train["type"].copy() # For SVM

    kdd_test_2labels_y = dataset.kdd_test["is"].copy() # For SVM
    kdd_test_5labels_y = dataset.kdd_test["type"].copy() # For SVM

    kdd_train_2labels.drop(["label", "type"], axis=1, inplace=True)
    kdd_test_2labels.drop(["label", "type"], axis=1, inplace=True)

    kdd_train_5labels.drop(["label", "is"], axis=1, inplace=True)
    kdd_test_5labels.drop(["label", "is"], axis=1, inplace=True)



In [21]:

    
preprocessing.kdd_train_2labels.columns.to_series().to_csv("dataset/columns_2labels.csv")
preprocessing.kdd_train_5labels.columns.to_series().to_csv("dataset/columns_5labels.csv")



In [22]:

    
preprocessing.kdd_train_2labels.shape









    Out[22]:





(25192, 120)



In [23]:

    
preprocessing.kdd_train_5labels.shape









    Out[23]:





(25192, 123)



In [24]:

    
preprocessing.kdd_test_2labels.shape









    Out[24]:





(11850, 120)



In [25]:

    
preprocessing.kdd_test_5labels.shape









    Out[25]:





(11850, 123)



In [26]:

    
preprocessing.kdd_train_2labels_y.shape









    Out[26]:





(25192,)



In [27]:

    
preprocessing.kdd_test_2labels_y.shape









    Out[27]:





(11850,)



In [28]:

    
preprocessing.kdd_train_5labels_y.shape









    Out[28]:





(25192,)



In [29]:

    
preprocessing.kdd_test_5labels_y.shape









    Out[29]:





(11850,)



In [31]:

    
preprocessing.kdd_train_2labels.to_pickle("dataset/kdd_train_2labels_20percent.pkl")
preprocessing.kdd_train_2labels_y.to_pickle("dataset/kdd_train_2labels_y_20percent.pkl")

preprocessing.kdd_train_5labels.to_pickle("dataset/kdd_train_5labels_20percent.pkl")
preprocessing.kdd_train_5labels_y.to_pickle("dataset/kdd_train_5labels_y_20percent.pkl")

preprocessing.kdd_test_2labels.to_pickle("dataset/kdd_test_2labels_20percent.pkl")
preprocessing.kdd_test_2labels_y.to_pickle("dataset/kdd_test_2labels_y_20percent.pkl")

preprocessing.kdd_test_5labels.to_pickle("dataset/kdd_test_5labels_20percent.pkl")
preprocessing.kdd_test_5labels_y.to_pickle("dataset/kdd_test_5labels_y_20percent.pkl")


dataset.kdd_diff_level_train.to_pickle("dataset/kdd_diff_level_train_20percent.pkl")
dataset.kdd_diff_level_test.to_pickle("dataset/kdd_diff_level_test_20percent.pkl")



In [ ]:

	duration	protocol_type	service	flag	src_bytes	dst_bytes	land	wrong_fragment	urgent	hot	...	dst_host_srv_count	dst_host_same_srv_rate	dst_host_diff_srv_rate	dst_host_same_src_port_rate	dst_host_srv_diff_host_rate	dst_host_serror_rate	dst_host_srv_serror_rate	dst_host_rerror_rate	dst_host_srv_rerror_rate	label
0	0	tcp	ftp_data	SF	491	0	0	0	0	0	...	25	0.17	0.03	0.17	0.00	0.00	0.00	0.05	0.00	normal
1	0	udp	other	SF	146	0	0	0	0	0	...	1	0.00	0.60	0.88	0.00	0.00	0.00	0.00	0.00	normal
2	0	tcp	private	S0	0	0	0	0	0	0	...	26	0.10	0.05	0.00	0.00	1.00	1.00	0.00	0.00	neptune
3	0	tcp	http	SF	232	8153	0	0	0	0	...	255	1.00	0.00	0.03	0.04	0.03	0.01	0.00	0.01	normal
4	0	tcp	http	SF	199	420	0	0	0	0	...	255	1.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	normal
5	0	tcp	private	REJ	0	0	0	0	0	0	...	19	0.07	0.07	0.00	0.00	0.00	0.00	1.00	1.00	neptune
6	0	tcp	private	S0	0	0	0	0	0	0	...	9	0.04	0.05	0.00	0.00	1.00	1.00	0.00	0.00	neptune
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
25185	1	tcp	smtp	SF	2896	333	0	0	0	0	...	11	0.92	0.17	0.08	0.00	0.00	0.00	0.00	0.00	normal
25186	0	tcp	http	S1	339	14600	0	0	0	0	...	255	1.00	0.00	0.01	0.01	0.01	0.00	0.01	0.00	normal
25187	0	tcp	exec	RSTO	0	0	0	0	0	0	...	7	0.03	0.06	0.00	0.00	0.00	0.00	1.00	1.00	neptune
25188	0	tcp	ftp_data	SF	334	0	0	0	0	0	...	39	1.00	0.00	1.00	0.18	0.00	0.00	0.00	0.00	warezclient
25189	0	tcp	private	REJ	0	0	0	0	0	0	...	13	0.05	0.07	0.00	0.00	0.00	0.00	1.00	1.00	neptune
25190	0	tcp	nnsp	S0	0	0	0	0	0	0	...	20	0.08	0.06	0.00	0.00	1.00	1.00	0.00	0.00	neptune
25191	0	tcp	finger	S0	0	0	0	0	0	0	...	49	0.19	0.03	0.01	0.00	1.00	1.00	0.00	0.00	neptune

	duration	protocol_type	service	flag	src_bytes	dst_bytes	land	wrong_fragment	urgent	hot	...	dst_host_srv_count	dst_host_same_srv_rate	dst_host_diff_srv_rate	dst_host_same_src_port_rate	dst_host_srv_diff_host_rate	dst_host_serror_rate	dst_host_srv_serror_rate	dst_host_rerror_rate	dst_host_srv_rerror_rate	label
0	13	tcp	telnet	SF	118	2425	0	0	0	0	...	10	0.38	0.12	0.04	0.00	0.00	0.00	0.12	0.30	guess_passwd
1	0	udp	private	SF	44	0	0	0	0	0	...	254	1.00	0.01	0.01	0.00	0.00	0.00	0.00	0.00	snmpguess
2	0	tcp	telnet	S3	0	44	0	0	0	0	...	79	0.31	0.61	0.00	0.00	0.21	0.68	0.60	0.00	processtable
3	0	udp	private	SF	53	55	0	0	0	0	...	255	1.00	0.00	0.87	0.00	0.00	0.00	0.00	0.00	normal
4	0	tcp	private	SH	0	0	0	0	0	0	...	1	0.06	1.00	1.00	0.00	1.00	1.00	0.00	0.00	nmap
5	0	tcp	http	SF	54540	8314	0	0	0	2	...	229	0.90	0.01	0.00	0.00	0.00	0.00	0.01	0.00	back
6	0	tcp	imap4	REJ	0	0	0	0	0	0	...	9	0.04	0.07	0.00	0.00	0.00	0.00	1.00	1.00	neptune
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
11843	0	tcp	other	REJ	0	0	0	0	0	0	...	1	0.00	0.90	0.00	0.00	0.15	0.00	0.85	1.00	satan
11844	5	tcp	pop_3	SF	28	93	0	0	0	0	...	25	1.00	0.00	0.04	0.08	0.00	0.00	0.00	0.00	guess_passwd
11845	0	udp	domain_u	SF	43	43	0	0	0	0	...	255	1.00	0.00	0.01	0.00	0.00	0.00	0.00	0.00	normal
11846	0	tcp	http	SF	336	285	0	0	0	0	...	234	0.92	0.02	0.00	0.00	0.00	0.00	0.05	0.00	normal
11847	1	tcp	telnet	RSTO	0	15	0	0	0	0	...	96	0.37	0.03	0.01	0.02	0.05	0.08	0.85	0.58	mscan
11848	0	tcp	sunrpc	REJ	0	0	0	0	0	0	...	52	0.19	0.03	0.01	0.04	0.00	0.00	0.88	1.00	mscan
11849	0	udp	private	SF	41	0	0	0	0	0	...	255	1.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	snmpguess

	duration	src_bytes	dst_bytes	land	wrong_fragment	urgent	hot	num_failed_logins	logged_in	num_compromised	...	dst_host_count	dst_host_srv_count	dst_host_same_srv_rate	dst_host_diff_srv_rate	dst_host_same_src_port_rate	dst_host_srv_diff_host_rate	dst_host_serror_rate	dst_host_srv_serror_rate	dst_host_rerror_rate	dst_host_srv_rerror_rate
count	25192.000000	2.519200e+04	2.519200e+04	25192.000000	25192.000000	25192.00000	25192.000000	25192.000000	25192.000000	25192.000000	...	25192.000000	25192.000000	25192.000000	25192.000000	25192.000000	25192.000000	25192.000000	25192.000000	25192.000000	25192.000000
mean	305.054104	2.433063e+04	3.491847e+03	0.000079	0.023738	0.00004	0.198039	0.001191	0.394768	0.227850	...	182.532074	115.063036	0.519791	0.082539	0.147453	0.031844	0.285800	0.279846	0.117800	0.118769
std	2686.555640	2.410805e+06	8.883072e+04	0.008910	0.260221	0.00630	2.154202	0.045418	0.488811	10.417352	...	98.993895	110.646850	0.448944	0.187191	0.308367	0.110575	0.445316	0.446075	0.305869	0.317333
min	0.000000	0.000000e+00	0.000000e+00	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000e+00	0.000000e+00	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	...	84.000000	10.000000	0.050000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	0.000000	4.400000e+01	0.000000e+00	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	...	255.000000	61.000000	0.510000	0.030000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
75%	0.000000	2.790000e+02	5.302500e+02	0.000000	0.000000	0.00000	0.000000	0.000000	1.000000	0.000000	...	255.000000	255.000000	1.000000	0.070000	0.060000	0.020000	1.000000	1.000000	0.000000	0.000000
max	42862.000000	3.817091e+08	5.151385e+06	1.000000	3.000000	1.00000	77.000000	4.000000	1.000000	884.000000	...	255.000000	255.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000

	duration	src_bytes	dst_bytes	land	wrong_fragment	urgent	hot	num_failed_logins	logged_in	num_compromised	...	dst_host_count	dst_host_srv_count	dst_host_same_srv_rate	dst_host_diff_srv_rate	dst_host_same_src_port_rate	dst_host_srv_diff_host_rate	dst_host_serror_rate	dst_host_srv_serror_rate	dst_host_rerror_rate	dst_host_srv_rerror_rate
count	11850.000000	1.185000e+04	1.185000e+04	11850.000000	11850.000000	11850.000000	11850.000000	11850.000000	11850.000000	11850.000000	...	11850.000000	11850.000000	11850.000000	11850.000000	11850.000000	11850.000000	11850.000000	11850.000000	11850.000000	11850.000000
mean	415.439831	1.945644e+04	1.228105e+03	0.000591	0.016034	0.001350	0.190211	0.041181	0.253080	0.227679	...	213.945401	117.541603	0.539597	0.148986	0.206493	0.024465	0.104181	0.106697	0.263952	0.252209
std	1919.441623	6.519865e+05	2.389603e+04	0.024299	0.196379	0.050299	1.222392	0.205401	0.434795	10.025774	...	82.203728	106.563438	0.423370	0.288758	0.379193	0.114234	0.265086	0.278549	0.377337	0.403039
min	0.000000	0.000000e+00	0.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000e+00	0.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	255.000000	12.000000	0.070000	0.010000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	0.000000	2.800000e+01	0.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	255.000000	83.000000	0.530000	0.020000	0.000000	0.000000	0.000000	0.000000	0.020000	0.000000
75%	1.000000	1.600000e+02	9.300000e+01	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	...	255.000000	250.000000	1.000000	0.070000	0.090000	0.000000	0.010000	0.000000	0.560000	0.520000
max	57715.000000	6.282565e+07	1.288652e+06	1.000000	3.000000	3.000000	101.000000	4.000000	1.000000	796.000000	...	255.000000	255.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000