In [3]:
    
import pandas as pd
import numpy as np
pd.set_option("display.max_rows",15)
%matplotlib inline
    
In [4]:
    
class dataset:
    # NSL-KDD column layout: 41 features, then the attack label and a
    # per-record difficulty score appended by the NSL-KDD distribution.
    col_names = ["duration","protocol_type","service","flag","src_bytes",
        "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
        "logged_in","num_compromised","root_shell","su_attempted","num_root",
        "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
        "is_host_login","is_guest_login","count","srv_count","serror_rate",
        "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
        "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
        "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
        "dst_host_rerror_rate","dst_host_srv_rerror_rate","label", "difficulty_level"]

    # Four raw splits: full train/test, the 20%-of-train subset, and the
    # harder "Test-21" subset. The .txt files ship without a header row.
    kdd_train = pd.read_csv("dataset/KDDTrain+.txt", names=col_names)
    kdd_test = pd.read_csv("dataset/KDDTest+.txt", names=col_names)

    kdd_train_ = pd.read_csv("dataset/KDDTrain+_20Percent.txt", names=col_names)
    kdd_test_ = pd.read_csv("dataset/KDDTest-21.txt", names=col_names)

    # Preserve the difficulty scores of the two full splits before that
    # column is removed from every frame.
    kdd_diff_level_train = kdd_train["difficulty_level"].copy()
    kdd_diff_level_test = kdd_test["difficulty_level"].copy()

    kdd_train = kdd_train.drop(columns="difficulty_level")
    kdd_test = kdd_test.drop(columns="difficulty_level")

    kdd_train_ = kdd_train_.drop(columns="difficulty_level")
    kdd_test_ = kdd_test_.drop(columns="difficulty_level")

    # Write CSV copies of the cleaned splits (index column included,
    # matching the original export behaviour).
    kdd_train.to_csv("dataset/KDDTrain+.csv")
    kdd_test.to_csv("dataset/KDDTest+.csv")

    kdd_train_.to_csv("dataset/KDDTrain_.csv")
    kdd_test_.to_csv("dataset/KDDTest_.csv")
    
In [5]:
    
# Convert the three nominal features to pandas Categoricals. The test / 20%-train /
# Test-21 frames are encoded with the *training* set's category list so that the
# one-hot columns produced later line up across all four frames (values unseen in
# training become NaN).
category_variables = ["protocol_type","service","flag"]
for cv in category_variables:
    dataset.kdd_train[cv] = dataset.kdd_train[cv].astype("category")

    # FIX: .astype("category", categories=...) was deprecated in pandas 0.21 and
    # removed in 0.25; a CategoricalDtype carries the fixed category list instead.
    train_dtype = pd.CategoricalDtype(categories=dataset.kdd_train[cv].cat.categories)
    dataset.kdd_test[cv] = dataset.kdd_test[cv].astype(train_dtype)
    dataset.kdd_train_[cv] = dataset.kdd_train_[cv].astype(train_dtype)
    dataset.kdd_test_[cv] = dataset.kdd_test_[cv].astype(train_dtype)

    print("Length of Categories for {} are {}".format(cv , len(dataset.kdd_train[cv].cat.categories)))
    print("Categories for {} are {} \n".format(cv ,dataset.kdd_train[cv].cat.categories))
    
    
    
In [6]:
    
# Rich display of the full training frame (truncated to 15 rows by the
# display.max_rows option set at the top of the notebook).
dataset.kdd_train
    
    Out[6]:
In [7]:
    
# Same preview for the full test frame.
dataset.kdd_test
    
    Out[7]:
In [8]:
    
# Numeric summary statistics of the training features.
dataset.kdd_train.describe()
    
    Out[8]:
In [9]:
    
# Sparsity check: fraction of all cells in the training frame that are
# exactly zero (boolean mask summed over both axes / total cell count).
a = dataset.kdd_train.isin([0])
a.sum().sum() / a.size
    
    Out[9]:
In [10]:
    
# Numeric summary statistics of the test features.
dataset.kdd_test.describe()
    
    Out[10]:
In [11]:
    
# Profile the target column: its distinct attack labels and the categorical
# summary statistics (count / unique / top / freq).
print("Column - Label")
# FIX: the original printed the entire column under the heading
# "Unique values"; print the distinct labels instead, as the heading claims.
print("Unique values: \n{}".format(dataset.kdd_train.label.unique()))
print("\nStatistical properties: \n{}".format(dataset.kdd_train.label.describe()))
    
    
In [12]:
    
# Mapping from each raw NSL-KDD label to one of four attack families
# (DoS, Probe, R2L, U2R) or 'normal'. Labels absent from this dict would
# raise KeyError when mapped below, which surfaces unexpected labels early.
# NOTE(review): some NSL-KDD taxonomies file 'httptunnel' under R2L rather
# than U2R — confirm the intended grouping.
attack_types = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',
    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}
# Collapse the four attack families into a binary Attack/Normal flag.
is_attack = {
    "DoS":"Attack",
    "R2L":"Attack",
    "U2R":"Attack",
    "Probe":"Attack",
    "normal":"Normal"
}
    
In [13]:
    
# Derive the two coarser targets on every split: a 5-class attack family
# column ("type") and a binary Attack/Normal column ("is").
# dict.__getitem__ keeps the original lambda's behaviour: an unknown label
# raises KeyError instead of silently becoming NaN.
for frame in (dataset.kdd_train, dataset.kdd_test, dataset.kdd_train_, dataset.kdd_test_):
    frame["type"] = frame["label"].map(attack_types.__getitem__)
    frame["is"] = frame["type"].map(is_attack.__getitem__)
    
In [14]:
    
# Sparsity of the Normal rows: zero-valued cell count, total cell count,
# and their ratio (the bare trailing expression is the cell's displayed value).
a = dataset.kdd_train.set_index("is")
normal_slice = a.loc["Normal"]
normal_zeros = normal_slice.isin([0]).sum().sum()
print(normal_zeros)
print(normal_slice.size)
normal_zeros / normal_slice.size
    
    
    Out[14]:
In [15]:
    
# Same sparsity check for the Attack rows of the training set.
a = dataset.kdd_train.set_index("is")
attack_slice = a.loc["Attack"]
attack_zeros = attack_slice.isin([0]).sum().sum()
print(attack_zeros)
print(attack_slice.size)
attack_zeros / attack_slice.size
    
    
    Out[15]:
In [16]:
    
1804888 / (1804888 + 1538253)
    
    Out[16]:
In [17]:
    
# Group the training set by the 5-class family and by the binary flag.
kdd_attack_type_group = dataset.kdd_train.groupby("type")
kdd_is_attack_group = dataset.kdd_train.groupby("is")
    
In [18]:
    
# Row count per attack family.
kdd_attack_type_group.type.count()
    
    Out[18]:
In [19]:
    
# Row count per Attack/Normal class.
kdd_is_attack_group["is"].count()
    
    Out[19]:
In [20]:
    
# Bare GroupBy object — displays only its repr, not data.
kdd_attack_type_group
    
    Out[20]:
In [21]:
    
# Raw labels that occur among Attack rows.
df = dataset.kdd_train.set_index("is")
df.loc["Attack"].label.unique()
    
    Out[21]:
In [22]:
    
# Raw labels that occur among Normal rows (expected: just 'normal').
df.loc["Normal"].label.unique()
    
    Out[22]:
In [23]:
    
#kdd_is_attack_group.hist(figsize=[25,22])
    
In [24]:
    
#kdd_attack_type_group.hist(figsize=[25,22])
    
In [25]:
    
# Percentage share of each difficulty level in the full training split
# (grouping a Series by itself counts occurrences per distinct value).
diff_train = dataset.kdd_diff_level_train
gb = diff_train.groupby(diff_train)
(gb.count() / diff_train.count())*100
    
    Out[25]:
In [26]:
    
# Same percentage breakdown of difficulty levels for the full test split.
diff_test = dataset.kdd_diff_level_test
gb = diff_test.groupby(diff_test)
(gb.count() / diff_test.count())*100
    
    Out[26]:
In [ ]:
    
    
In [27]:
    
# Columns to one-hot encode: the three nominal features plus the target
# column itself ("is" for the binary task, "type" for the 5-class task).
dummy_variables_2labels = [*category_variables, "is"]
dummy_variables_5labels = [*category_variables, "type"]
# Integer codes for the SVM-style y vectors built below.
attack_codes_2labels = {"Attack":1, "Normal":0}
attack_codes_5labels = {'DoS':1, 'normal':0, 'Probe':2, 'R2L':3, 'U2R':4}
    
# Namespace class holding the encoded X frames and integer y Series for all
# four splits and both label granularities. Naming convention: a trailing
# underscore before the label suffix (e.g. kdd_train__2labels) marks the
# 20%-train / Test-21 subsets. Because the categorical dtypes were aligned
# to the training categories earlier, get_dummies yields the same one-hot
# columns for every frame.
class preprocessing:
    # One-hot encoded feature frames; the target is temporarily one-hot
    # encoded too and the unused label columns are dropped further down.
    kdd_train_2labels = pd.get_dummies(dataset.kdd_train, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_train_5labels = pd.get_dummies(dataset.kdd_train, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)
    kdd_test_2labels = pd.get_dummies(dataset.kdd_test, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_test_5labels = pd.get_dummies(dataset.kdd_test, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)
    kdd_train__2labels = pd.get_dummies(dataset.kdd_train_, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_train__5labels = pd.get_dummies(dataset.kdd_train_, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)
        
    kdd_test__2labels = pd.get_dummies(dataset.kdd_test_, columns = dummy_variables_2labels, prefix=dummy_variables_2labels)
    kdd_test__5labels = pd.get_dummies(dataset.kdd_test_, columns = dummy_variables_5labels, prefix=dummy_variables_5labels)
    # String targets copied before encoding; mapped to integer codes below.
    kdd_train_2labels_y = dataset.kdd_train["is"].copy() # For SVM
    kdd_train_5labels_y = dataset.kdd_train["type"].copy() # For SVM
    kdd_test_2labels_y = dataset.kdd_test["is"].copy() # For SVM
    kdd_test_5labels_y = dataset.kdd_test["type"].copy() # For SVM
    
    kdd_train__2labels_y = dataset.kdd_train_["is"].copy() # For SVM
    kdd_train__5labels_y = dataset.kdd_train_["type"].copy() # For SVM
    
    kdd_test__2labels_y = dataset.kdd_test_["is"].copy() # For SVM
    kdd_test__5labels_y = dataset.kdd_test_["type"].copy() # For SVM
    # Drop the raw label and the *other* task's target from each X frame
    # (2-label frames keep only the is_* one-hot columns, and vice versa).
    kdd_train_2labels.drop(["label", "type"], axis=1, inplace=True)
    kdd_test_2labels.drop(["label", "type"], axis=1, inplace=True)
    
    kdd_train__2labels.drop(["label", "type"], axis=1, inplace=True)
    kdd_test__2labels.drop(["label", "type"], axis=1, inplace=True)
    kdd_train_5labels.drop(["label", "is"], axis=1, inplace=True)
    kdd_test_5labels.drop(["label", "is"], axis=1, inplace=True)
    
    kdd_train__5labels.drop(["label", "is"], axis=1, inplace=True)
    kdd_test__5labels.drop(["label", "is"], axis=1, inplace=True)
    
    # Replace the string targets with integer codes; a lambda lookup (rather
    # than mapping the dict directly) raises KeyError on unexpected values.
    kdd_train_2labels_y = kdd_train_2labels_y.map(lambda x: attack_codes_2labels[x])
    kdd_test_2labels_y = kdd_test_2labels_y.map(lambda x: attack_codes_2labels[x])
    
    kdd_train__2labels_y = kdd_train__2labels_y.map(lambda x: attack_codes_2labels[x])
    kdd_test__2labels_y = kdd_test__2labels_y.map(lambda x: attack_codes_2labels[x])
    
    kdd_train_5labels_y = kdd_train_5labels_y.map(lambda x: attack_codes_5labels[x])
    kdd_test_5labels_y = kdd_test_5labels_y.map(lambda x: attack_codes_5labels[x])
    
    kdd_train__5labels_y = kdd_train__5labels_y.map(lambda x: attack_codes_5labels[x])
    kdd_test__5labels_y = kdd_test__5labels_y.map(lambda x: attack_codes_5labels[x])
    
In [ ]:
    
    
In [28]:
    
# Persist the encoded column lists so downstream notebooks can rebuild
# frames with identical column order.
preprocessing.kdd_train_2labels.columns.to_series().to_csv("dataset/columns_2labels.csv")
preprocessing.kdd_train_5labels.columns.to_series().to_csv("dataset/columns_5labels.csv")
    
In [29]:
    
# Spot-check the encoded column names.
preprocessing.kdd_train_2labels.columns
    
    Out[29]:
In [30]:
    
# Shape sanity checks: train/test X frames of the same task should agree on
# the column count, and each y Series should match its X frame's row count.
preprocessing.kdd_train_2labels.shape
    
    Out[30]:
In [31]:
    
preprocessing.kdd_train_5labels.shape
    
    Out[31]:
In [32]:
    
preprocessing.kdd_test_2labels.shape
    
    Out[32]:
In [33]:
    
preprocessing.kdd_test_5labels.shape
    
    Out[33]:
In [34]:
    
preprocessing.kdd_train_2labels_y.shape
    
    Out[34]:
In [35]:
    
preprocessing.kdd_test_2labels_y.shape
    
    Out[35]:
In [36]:
    
preprocessing.kdd_train_5labels_y.shape
    
    Out[36]:
In [37]:
    
preprocessing.kdd_test_5labels_y.shape
    
    Out[37]:
In [38]:
    
import matplotlib
from pandas.plotting import andrews_curves
from pandas.plotting import parallel_coordinates
from sklearn import preprocessing as ps
from pandas.plotting import radviz
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
    
In [39]:
    
# Build the t-SNE input frames: drop the one-hot target columns from X and
# re-attach the integer 0/1 target as the final column (sliced off again in
# the sampling cells below via iloc[:, :-1] / iloc[:, -1]).
df_train = preprocessing.kdd_train_2labels.drop(["is_Attack", "is_Normal"], axis = 1)
df_test = preprocessing.kdd_test_2labels.drop(["is_Attack", "is_Normal"], axis = 1)
df_train = pd.concat([df_train, preprocessing.kdd_train_2labels_y], axis = 1)
df_test = pd.concat([df_test, preprocessing.kdd_test_2labels_y], axis = 1)
    
In [40]:
    
# 2-D t-SNE model with a fixed seed; the embedding itself is loaded from
# pickle caches further below rather than recomputed here.
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
#np.set_printoptions(suppress=True)
# The commented lines generated the cached 10% sample once; the notebook now
# reloads it so the expensive sampling/t-SNE runs are skipped on re-run.
#sample = df_train.sample(int(df_train.shape[0]*.1)) # 10% of total data
#sample.to_pickle("dataset/tsne_sample.pkl")
# NOTE(review): pickle.load runs arbitrary code — only load trusted caches.
sample = pd.read_pickle("dataset/tsne_sample.pkl")
    
In [ ]:
    
    
In [41]:
    
# Split the cached sample back into features (all but last column) and the
# 0/1 target (last column, appended in the cell that built df_train).
x_tsne = sample.iloc[:, :-1]
y_tsne = sample.iloc[:, -1]
# SparsePCA down to 40 components; the transform itself was run once and
# cached (see commented line), so only the estimator is constructed here.
from sklearn.decomposition import SparsePCA
pca_analysis = SparsePCA(n_components=40)
#x_tsne_pca = pca_analysis.fit_transform(x_tsne)
    
In [42]:
    
# Reload the cached 40-component projection as a plain ndarray.
#pd.DataFrame(x_tsne_pca).to_pickle("dataset/tsne_pca_df.pkl")
x_tsne_pca = pd.read_pickle("dataset/tsne_pca_df.pkl").values
    
In [43]:
    
# Rebuild a labelled frame for the Andrews-curves plot: map the integer
# codes back to "Attack"/"Normal" strings for the class column.
x_tsne_pca_df = pd.DataFrame(x_tsne_pca)
codes_to_attack = {1:"Attack", 0:"Normal"}
y_tsne_cta = y_tsne.map(lambda x: codes_to_attack[x])
x_tsne_pca_df['is'] = y_tsne_cta.values
    
In [44]:
    
# Andrews curves of the 40-component projection, coloured by class.
plt.figure(figsize=(7,3))
andrews_curves(x_tsne_pca_df, "is")
    
    Out[44]:
    
In [45]:
    
#df = model.fit_transform(x_tsne_pca) 
#df1 = model.fit_transform(df)
#df2 = model.fit_transform(df1) 
#df3 = model.fit_transform(df2)
    
In [46]:
    
#pd.DataFrame(df).to_pickle("dataset/tsne_df.pkl")
#pd.DataFrame(df1).to_pickle("dataset/tsne_df1.pkl")
#pd.DataFrame(df2).to_pickle("dataset/tsne_df2.pkl")
#pd.DataFrame(df3).to_pickle("dataset/tsne_df3.pkl")
    
In [47]:
    
# Load the four cached t-SNE embeddings (produced by the commented-out cells
# above, each refining the previous embedding) as plain ndarrays.
# NOTE(review): only unpickle trusted local caches.
df = pd.read_pickle("dataset/tsne_df.pkl").values
df1 = pd.read_pickle("dataset/tsne_df1.pkl").values
df2 = pd.read_pickle("dataset/tsne_df2.pkl").values
df3 = pd.read_pickle("dataset/tsne_df3.pkl").values
    
In [48]:
    
# 2x2 grid of the four cached embeddings (1000/2000/3000/4000 refinement
# epochs), Normal vs Attack points in each panel. The four copy-pasted
# plotting stanzas are collapsed into one loop; axes keep their ax1..ax4
# names and ax1.legend(loc=0) stays the cell's displayed value.
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10,5))
panels = [(ax1, df, "After 1000 epochs"),
          (ax2, df1, "After 2000 epochs"),
          (ax3, df2, "After 3000 epochs"),
          (ax4, df3, "After 4000 epochs")]
for ax, points, title in panels:
    ax.scatter(x = points[y_tsne==0,0], y = points[y_tsne==0,1], label = 'Normal')
    ax.scatter(x = points[y_tsne==1,0], y = points[y_tsne==1,1], label = 'Attack')
    ax.title.set_text(title)
plt.subplots_adjust(wspace=0.05, hspace=0.18)
ax1.legend(loc=0)
    
    Out[48]:
    
In [49]:
    
# Enlarged view of the final (4000-epoch) embedding.
# NOTE(review): labels are passed to scatter but no plt.legend() is called,
# so the legend never renders in this figure — confirm whether intended.
plt.figure(figsize=(15,8))
plt.scatter(x = df3[y_tsne==0,0], y = df3[y_tsne==0,1], label = 'Normal')
plt.scatter(x = df3[y_tsne==1,0], y = df3[y_tsne==1,1], label = 'Attack')
plt.title("After 4000 epochs")
    
    Out[49]:
    
In [50]:
    
# Persist every training X/y pair (both tasks, full and 20% splits) for
# downstream model notebooks.
preprocessing.kdd_train_2labels.to_pickle("dataset/kdd_train_2labels.pkl")
preprocessing.kdd_train_2labels_y.to_pickle("dataset/kdd_train_2labels_y.pkl")
preprocessing.kdd_train_5labels.to_pickle("dataset/kdd_train_5labels.pkl")
preprocessing.kdd_train_5labels_y.to_pickle("dataset/kdd_train_5labels_y.pkl")
preprocessing.kdd_train__2labels.to_pickle("dataset/kdd_train__2labels.pkl")
preprocessing.kdd_train__2labels_y.to_pickle("dataset/kdd_train__2labels_y.pkl")
preprocessing.kdd_train__5labels.to_pickle("dataset/kdd_train__5labels.pkl")
preprocessing.kdd_train__5labels_y.to_pickle("dataset/kdd_train__5labels_y.pkl")
    
In [51]:
    
# Test-side exports. NOTE(review): only three test objects are pickled —
# kdd_test_2labels(_y), kdd_test_5labels and kdd_test__2labels(_y) are never
# written, unlike the train side above; confirm this asymmetry is intended.
preprocessing.kdd_test_5labels_y.to_pickle("dataset/kdd_test_5labels_y.pkl")
preprocessing.kdd_test__5labels.to_pickle("dataset/kdd_test__5labels.pkl")
preprocessing.kdd_test__5labels_y.to_pickle("dataset/kdd_test__5labels_y.pkl")
    
In [52]:
    
# Persist the difficulty scores saved before the column was dropped.
dataset.kdd_diff_level_train.to_pickle("dataset/kdd_diff_level_train.pkl")
dataset.kdd_diff_level_test.to_pickle("dataset/kdd_diff_level_test.pkl")
    
In [ ]: