In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os,sys
#import csv
import pandas as pan
import cPickle as pickle
import pprint
#import timeit
get_ipython().magic(u'matplotlib inline')
#import tables #PyTables used to generate HDF5 file instead of pickle
In [2]:
rootdir="/home/ilan/Desktop/KDD_1999/"
os.chdir(rootdir)
datadir="/home/ilan/Desktop/KDD_1999/KDD_1999_data/"
filename = 'kddcup.data.corrected'
# Cache the parsed DataFrame as a pickle so the large csv is only parsed once.
pklfile="data.pkl"
folderpath=os.path.join(datadir,pklfile)
if os.path.exists(folderpath):
    print("Pickle file containing data found. Loading it...")
    # 'rb' (not 'r'): pickles written with HIGHEST_PROTOCOL are binary.
    with open(folderpath,'rb') as infile:
        data = pickle.load(infile)
else:
    # Column names for the 41 KDD'99 features plus the class label.
    columns = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment",
               "urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted",
               "num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds",
               "is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
               "rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate",
               "dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
               "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
               "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","Label"]
    # header=None: the csv has no header row, so the names list supplies the columns.
    # index_col=False: otherwise pandas treats the first column as the index.
    print("Reading in csv file and creating pickle...")
    data = pan.read_csv(filename, delimiter=',',header=None,names=columns,index_col=False)
    # Remove the trailing period in the label column ("normal." -> "normal").
    data['Label']=data['Label'].apply(lambda x: x.strip('.'))
    # BUG FIX: previously the pickle was dumped to pklfile relative to the cwd
    # (rootdir), while the existence check above looks in datadir -- so the
    # cache was never found on later runs. Dump to the same path that is read.
    with open(folderpath,'wb') as output:
        pickle.dump(data, output, pickle.HIGHEST_PROTOCOL)
data
# This custom function to read in csv files is fastest and requires the least RAM, but requires
# perfect input data as no allowances for missing or imperfect data are made
#def custom_loadtxt(filename, delimiter=',', skiprows=0, dtype=float):
# def iter_func():
# with open(filename, 'r') as infile:
# for _ in range(skiprows):
# next(infile)
# for line in infile:
# line = line.rstrip().split(delimiter)
# for counter,item in enumerate(line):
# yield dtype[counter](item)
# custom_loadtxt.rowlength = len(line)
# data = np.fromiter(iter_func(), dtype=dtype)
# data = data.reshape((-1, custom_loadtxt.rowlength))
# return data
#filename1= (str(args.corpus).replace('.mm',''))+'_lsi_topics.pkl'
#with open(filename1,'wb') as output:
# pickle.dump(corpustopics, output)
#topics=pickle.load(open(topicspath,'r'))
Out[2]:
In [3]:
# All the labels in the data, and their counts
# value_counts() returns the per-label record counts, most frequent first;
# the bar plot gives a quick view of the (heavy) class imbalance.
labelcounts=data['Label'].value_counts()
print labelcounts
labelcounts.plot(kind='bar')
Out[3]:
In [4]:
# Form the Numpy arrays that scikit-learn requires as input for its classification algorithms
# Internally, Pandas DataFrames are Numpy arrays, use the .values suffix to get these
# X will be the (transaction x features) array (excluding the labels, those will go in y)
# We will exclude features which are strings ("protocol_type","service","flag") for now
X = data[["duration","src_bytes","dst_bytes","land","wrong_fragment",
"urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted",
"num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds",
"is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
"rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate",
"dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
"dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
"dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate"]].values
# y will be the (transactions x 1) vector with the label of each transaction
y = data['Label'].values
# target_names will be the vector with the possible labels/classes
# NOTE(review): this list is alphabetically ordered, which must match the
# sorted label order np.unique/classification_report use -- confirm it lists
# exactly the labels present in this version of the data set.
target_names = ["back","buffer_overflow","ftp_write","guess_passwd","imap","ipsweep","land","loadmodule",
"multihop","neptune","nmap","normal","perl","phf","pod","portsweep","rootkit","satan",
"smurf","spy","teardrop","warezclient","warezmaster"]
In [5]:
# Sanity checks on the feature matrix and label vector built above
print "Size of X (nsamples, nfeatures):", X.shape
print "Excerpt of X"
# Show only the top-left 10x10 corner rather than dumping the full array
print X[:10,:10]
print "Size of y (nsamples):", y.shape
print "Class names:", target_names
In [6]:
from sklearn.preprocessing import StandardScaler #transform data to 0 mean and unit variance (needed for many classifiers)
from random import sample
# Form a subset of the data by sampling without replacement, to explore model performances with limited computation
# (and also, using the full data set gives a Memory Error)
# Number of entries corresponding to 10% of the data
# (NOTE: the old comments said 1%, but the factor is 0.1 = 10%)
ents = int(len(X)*0.1)
# Take a random sample of size 10% from the data.
# BUG FIX: range(0, len(X)-1) excluded the last row (range already stops
# before its end point); range(len(X)) lets every row be drawn.
xs = sample(range(len(X)), ents)
X_small = X[xs]
y_small = y[xs]
# Per-class counts in the sample and in the full data set
distinct, counts = np.unique(y_small, return_counts=True)
pairs = zip(distinct, counts)
fulldistinct, fullcounts = np.unique(y, return_counts=True)
fullpairs = zip(fulldistinct, fullcounts)
# For each class show count in full data set, in sampled subset, and % in full data set and in subset
# This is done to ensure that our sample contains all the main classes of the full data set,
# in roughly the same proportions
# If so we can proceed with this sample
# and have some confidence that the relative accuracies of the models in the cross validation
# will be representative of those which would be obtained from the full training set
comparet1=[[types1, fullcount, count, "%.2G" % (float(fullcount)/float(len(y))*100.)+'%', "%.2G" % (float(count)/float(len(y_small))*100.)+'%']
           for (types1, fullcount) in fullpairs
           for (types2, count) in pairs if types1==types2]
pprint.pprint(sorted(comparet1, key=lambda x: x[1], reverse=True))
# Train scaler on the sample and apply it (many classifiers need zero-mean,
# unit-variance features)
scaler = StandardScaler()
scaler.fit(X_small)
X_scaled = scaler.transform(X_small)
In [7]:
# Function to plot the confusion matrix
def plot_confusion_matrix(y_true, y_pred, target_names):
    """Plot a confusion matrix for the given true/predicted label vectors.

    Cell colours show the row-normalised fractions (each row = one true
    class), while the text annotations show the raw counts.

    y_true, y_pred : label vectors of equal length
    target_names   : class names used as axis tick labels (their order must
                     match the sorted-label order confusion_matrix uses)
    Returns the matplotlib Figure containing the plot.
    """
    ncls = len(target_names)
    cm = confusion_matrix(y_true, y_pred)
    # Normalise each row to fractions; an all-zero row (class absent from
    # y_true) stays zero instead of dividing by zero.
    norm_conf = []
    for row in cm:
        rowsum = sum(row, 0)
        if rowsum == 0:
            norm_conf.append([0] * len(row))
        else:
            norm_conf.append([float(cell) / float(rowsum) for cell in row])
    fig = plt.figure(figsize=(12,12))
    ax = fig.add_subplot(111)
    ax.set_aspect(1)
    # BUG FIX: a later plt.imshow(cm, ...) used to overwrite this normalised
    # image with the raw counts, discarding norm_conf entirely.
    res = ax.imshow(np.array(norm_conf), cmap=plt.cm.jet,
                    interpolation='nearest')
    # Annotate every cell with its raw count (note: xy is (col, row))
    for x in xrange(ncls):
        for y in xrange(ncls):
            ax.annotate(str(cm[x][y]), xy=(y, x),
                        horizontalalignment='center',
                        verticalalignment='center',
                        fontsize=8)
    fig.colorbar(res)
    plt.xticks(np.arange(ncls), target_names, fontsize=16,rotation='vertical')
    plt.yticks(np.arange(ncls), target_names, fontsize=16)
    plt.xlabel("Predicted label", fontsize=20)
    plt.ylabel("True label", fontsize=20)
    plt.title("Confusion matrix", fontsize=22)
    # BUG FIX: previously a fresh empty figure was created here and returned,
    # so callers received a blank figure instead of the plot above.
    return fig
In [8]:
# Prepare for cross validation runs
from sklearn.preprocessing import StandardScaler #transform data to 0 mean and unit variance (needed for many classifiers)
from sklearn.metrics import accuracy_score #Accuracy metric (fraction correct)
from sklearn.cross_validation import KFold # Cross validation
from sklearn.cross_validation import StratifiedKFold # Cross validation
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report
#cross_val = KFold(len(y), 5)
from sklearn.cross_validation import train_test_split
def cross_validate(X,y,kfolds,model,**kwargs):
"""Run cross validation of the model on the feature matrix X, with label vector y, using stratified kfolds"""
assert(kfolds>=1), "Invalid number of kfolds entered, must be int >= 1"
if (kfolds==1):
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#X_train, X_test = X[train_idx], X[test_idx]
#y_train, y_test = y[train_idx], y[test_idx]
#train_idx,test_idx = train_test_split(range(0,len(y)-1))
#cross_val = zip(train_idx,test_idx)
cross_val = [[0,0]]
cross_val[0] = train_test_split(range(0,len(y)-1), test_size=0.3)
elif (kfolds>=2):
#cross_val = KFold(len(y),kfolds)
cross_val = StratifiedKFold(y,kfolds)
y_true = []
y_pred = []
k_fold_acc = []
for train_idx, test_idx in cross_val:
#select training and test data
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
#train scaler and apply to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
clf = model(**kwargs)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
y_true = np.append(y_true, y_test)
y_pred = np.append(y_pred, pred)
k_fold_acc = np.append(k_fold_acc, np.mean(y_pred==y_true))
print "Accuracy of "+str(model)+" in past fold: %2.3f" % np.mean(y_true==y_pred)
print "Mean accuracy of "+str(model)+" over "+str(kfolds)+" k-folds: %2.3f" % np.mean(k_fold_acc)
print "For comparison, accuracy from purely random classification: %2.3f" % (1.0/float(len(np.unique(y))))
print "Classification report for "+str(model)
print classification_report(y_true, y_pred, target_names=target_names)
#print "Confusion matrix for "+str(model)
#pprint.pprint(confusion_matrix(y_true, y_pred))
plot_confusion_matrix(y_true, y_pred, target_names=np.unique(y_true))
# We also form the cross validation samples and run the ML algorithms in separate functions, to use if desired.
# This is so that: a) cross validation samples aren't unnecessarily regenerated for each algorithm
# b) the comparison between the models is for exactly the same training and test data sets
# (accuracies for individual k-folds can then be compared)
def form_cross_val_set(X, y, kfolds):
    """Build a reusable cross-validation index set for X/y.

    kfolds == 1 : a single random 70/30 train/test split (as index lists).
    kfolds >= 2 : a stratified k-fold iterator over y.

    Returns an iterable of (train_idx, test_idx) pairs, so several models can
    be evaluated on exactly the same folds (see run_model).
    """
    assert(kfolds >= 1), "Invalid number of kfolds entered, must be int >= 1"
    if kfolds == 1:
        # BUG FIX: range(0, len(y)-1) silently dropped the last sample
        # (range already excludes its end point); use range(len(y)).
        cross_val = [train_test_split(range(len(y)), test_size=0.3)]
    else:
        cross_val = StratifiedKFold(y, kfolds)
    return cross_val
def run_model(cross_val_object, model,**kwargs):
y_true = []
y_pred = []
k_fold_acc = []
for train_idx, test_idx in cross_val_object:
#select training and test data
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
#train scaler and apply to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
clf = model(**kwargs)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
y_true = np.append(y_true, y_test)
y_pred = np.append(y_pred, pred)
k_fold_acc = np.append(k_fold_acc, np.mean(y_pred==y_true))
print "Accuracy of "+str(model)+" in past fold: %2.3f" % np.mean(y_true==y_pred)
print "Mean accuracy of "+str(model)+" over all k-folds: %2.3f" % np.mean(k_fold_acc)
print "For comparison, accuracy from purely random classification: %2.3f" % (1.0/float(len(np.unique(y))))
print "Classification report for "+str(model)
print classification_report(y_true, y_pred, target_names=target_names)
#print "Confusion matrix for "+str(model)
#pprint.pprint(confusion_matrix(y_true, y_pred))
plot_confusion_matrix(y_true, y_pred, target_names=np.unique(y_true))
In [9]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import SGDClassifier as SGD
# Perceptron() is actually equivalent to:
# SGDClassifier(loss=”perceptron”, eta0=1, learning_rate=”constant”, penalty=None).
from sklearn.linear_model import Perceptron
#from sklearn.svm import LinearSVC as SVM
#from sklearn.neighbors import KNeighborsClassifier as KNN
#from sklearn.linear_model import LogisticRegression
# Build one shared cross-validation index set so all three models are
# compared on exactly the same folds.
cv = form_cross_val_set(X=X,y=y,kfolds=5)
run_model(cross_val_object=cv, model=RF, n_jobs=4)
# NOTE(review): class_weight='auto' was deprecated in favour of 'balanced'
# in scikit-learn 0.17 -- confirm against the installed version.
run_model(cross_val_object=cv, model=SGD, n_jobs=4, class_weight='auto')
run_model(cross_val_object=cv, model=Perceptron, n_jobs=4, class_weight='auto')
# To run each with a different underlying cross validation data sets:
#cross_validate(X=X,y=y,kfolds=5,model=RF,n_jobs=4)
#cross_validate(X=X,y=y,kfolds=5,model=SGD,n_jobs=4)
#cross_validate(X=X,y=y,kfolds=5,model=Perceptron,n_jobs=4)
#cross_validate(X=X_small,y=y_small,kfolds=1,model=SVM)
In [ ]:
#from sklearn.neighbors import KNeighborsClassifier #The classification algorithm
# We of course should not be testing on the training set, but we are only checking that the code runs as expected
#K = 1 # number of nearest neighbours (Normally we'd optimize this in an inner loop!)
#Fit classifier and test it on the data
#clf = KNeighborsClassifier(K)
#clf.fit(X_scaled, y_small)
#y_pred = clf.predict(X_scaled)
#calculate the percentage correctly labelled samples
#print "Accuracy: %2.3f" % accuracy_score(y_small, y_pred)
In [ ]:
# Now do it properly, with k-fold cross validation
from sklearn.neighbors import KNeighborsClassifier
#Define classifier
K = 1 #number of nearest neighbours
clf = KNeighborsClassifier(K)
#Define cross-validation strategy
#cross_val = KFold(len(y_small), 5)
# loop over fold indexes
# train on trainingset, test on testset
# compute accuracy
y_true = []
y_pred = []
for train_idx, test_idx in cross_val:
#select training and test data
X_train, X_test = X_small[train_idx], X_small[test_idx]
y_train, y_test = y_small[train_idx], y_small[test_idx]
#train scaler and apply to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#train and test classifier
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
y_true = np.append(y_true, y_test)
y_pred = np.append(y_pred, pred)
print "Accuracy: %2.3f" % accuracy_score(y_true, y_pred)
In [ ]:
# We can do the same using much less code using scikit-learn's pipelines
from sklearn.ensemble import RandomForestClassifier
#Define classifier
clf = RandomForestClassifier(n_estimators=10,max_features='auto',max_depth=None,n_jobs=4)
#Define cross-validation strategy
# NOTE(review): cross_val is only defined if the KNN cross-validation cell
# above has been executed -- on a fresh kernel this cell raises NameError.
# Its folds were also sized for y_small, not the full X/y used here; verify.
pipe = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=cross_val)
print scores
print "Accuracy: %2.3f" % np.mean(scores)
In [ ]:
# We can do the same using much less code using scikit-learn's pipelines
from sklearn.neighbors import KNeighborsClassifier
#Define classifier
K = 5 #number of nearest neighbours
clf = KNeighborsClassifier(K)
#Define cross-validation strategy
# NOTE(review): cross_val is only defined if the KNN cross-validation cell
# above has been executed -- on a fresh kernel this cell raises NameError.
# Its folds were also sized for y_small, not the full X/y used here; verify.
pipe = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=cross_val)
print scores
print "Accuracy: %2.3f" % np.mean(scores)
In [ ]:
# We can do the same using much less code using scikit-learn's pipelines
from sklearn import svm
#Define classifier
# LinearSVC(), SVC(), NuSVC()
# dual=False: solve the primal problem (preferred when nsamples > nfeatures)
clf = svm.LinearSVC(dual=False,fit_intercept=True)
#Define cross-validation strategy
# NOTE(review): cross_val is only defined if the KNN cross-validation cell
# above has been executed -- on a fresh kernel this cell raises NameError.
# Its folds were also sized for y_small, not the full X/y used here; verify.
pipe = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=cross_val)
print scores
print "Accuracy: %2.3f" % np.mean(scores)
In [ ]:
# Compute some evaluation metrics
# NOTE(review): y_true/y_pred are whatever the most recently executed
# cross-validation cell left behind -- the result depends on execution order.
print classification_report(y_true, y_pred, target_names=target_names)
print "Confusion matrix"
print confusion_matrix(y_true, y_pred)
print "For comparison, precision from purely random classification: ",1.0/float(len(target_names))
plot_confusion_matrix(y_true, y_pred, target_names)
In [ ]:
# Plot the classification boundary between two of the features
# Beware: this implies training the algorithm on the subset of features of interest,
# so not computationally negligible
from matplotlib.colors import ListedColormap
# Choose size of data subset to use (1% of the rows)
entries = int(len(X)*0.01)
data_subset = data[:entries]
# Choose the two features to consider
X_plot = data_subset[["num_failed_logins","logged_in"]].values
y_plot = data_subset['Label'].values
h = .01 # step size in the plotting mesh
# 128 colours, so each distinct predicted class can get its own region colour
colrs= ["#000000", "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059",
        "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87",
        "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80",
        "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",
        "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",
        "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",
        "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",
        "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",
        "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",
        "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",
        "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",
        "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",
        "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72", "#6A3A4C",
        "#83AB58", "#001C1E", "#D1F7CE", "#004B28", "#C8D0F6", "#A3A489", "#806C66", "#222800",
        "#BF5650", "#E83000", "#66796D", "#DA007C", "#FF1A59", "#8ADBB4", "#1E0200", "#5B4E51",
        "#C895C5", "#320033", "#FF6832", "#66E1D3", "#CFCDAC", "#D0AC94", "#7ED379", "#012C58"]
n_neighbours = 1 # Number of nearest neighbours
# BUG FIX: the scaler used to be fitted and applied inside the loop below, so
# the second iteration re-fitted it on already-scaled data and transformed the
# points a second time. Fit and transform exactly once, before the loop.
scaler_plot = StandardScaler().fit(X_plot)
X_plot = scaler_plot.transform(X_plot)
# We train the algorithm and plot its decision boundaries in the two features, no testing involved
for weights in ['uniform', 'distance']:
    # Create an instance of the neighbours classifier and fit the data
    clf_plot = KNeighborsClassifier(n_neighbours, weights=weights)
    clf_plot.fit(X_plot, y_plot)
    # Generate a mesh [x_min, x_max]x[y_min, y_max] and predict the label for each point
    x_min, x_max = X_plot[:, 0].min() - 1, X_plot[:, 0].max() + 1
    y_min, y_max = X_plot[:, 1].min() - 1, X_plot[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf_plot.predict(np.c_[xx.ravel(), yy.ravel()])
    # Use only as many colours as there are distinct predicted classes
    colrs_req = ListedColormap(colrs[:len(set(Z))])
    # Put the result into a colour plot
    # NOTE(review): Z holds string labels; pcolormesh expects numeric values
    # -- the original's commented-out mapping hints this may need a
    # label-to-integer mapping to render. Confirm.
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=colrs_req)
    # Plot also the training points
    plt.scatter(X_plot[:, 0], X_plot[:, 1])
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("Two-feature classification boundary (k = %i, weights = '%s')"
              % (n_neighbours, weights))
    plt.show()
# In[ ]: