In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os,sys
#import csv
import pandas as pan
import cPickle as pickle
import pprint
#import timeit
get_ipython().magic(u'matplotlib inline')
#import tables  #PyTables used to generate HDF5 file instead of pickle

In [2]:
rootdir="/home/ilan/Desktop/KDD_1999/"
os.chdir(rootdir)
datadir="/home/ilan/Desktop/KDD_1999/KDD_1999_data/"
filename = 'kddcup.data.corrected'
#filepath = os.path.join(rootdir,filename)

#try:
#    with open(filename,'r') as f:
#        data = csv.reader(f)
#except csv.Error as err:
#    print ("Loading csv file failed",format(err))
#    sys.exit('File %s, line %d: %s' % (filename, reader.line_num, err))
#type(data)

# f4=32bit float, f8=64bit float, i4=32bit integer, i8=64bit integer, a10=10 character string
#coltypes = np.dtype("f8, a10, a10, a10, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, \
#                    i8, i8, i8, i8, i8, i8, i8, f8, f8, f8, f8, f8, f8, f8, i8, i8, f8, f8, \
#                    f8, f8, f8, f8, f8, f8, a10")

# np.genfromtxt can handle missing data but runs much slower and uses much more RAM 
#data = np.genfromtxt(filename, dtype=coltypes, delimiter=',', skip_header=0)
#data = np.loadtxt(filename, dtype=coltypes, delimiter=',')

pklfile="data.pkl"
#hffile="data.h5"
folderpath=os.path.join(datadir,pklfile)
#folderpath=os.path.join(rootdir,hffile)
if os.path.exists(folderpath):
    print("Pickle file containing data found. Loading it...")
    with open(folderpath, 'rb') as pklin:
        data = pickle.load(pklin)
    #data = tables.open_file(folderpath, driver="H5FD_CORE")
else:
    # np.genfromtxt can handle missing data but runs much slower and uses much more RAM 
    #data = np.genfromtxt(filename, dtype=coltypes, delimiter=',', skip_header=0)
    #data = np.loadtxt(filename, dtype=coltypes, delimiter=',')
    columns = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment",
               "urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted",
               "num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds",
               "is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
               "rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate",
               "dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
               "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
               "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","Label"]
    # header=None stops pandas reading column names from a row of the file, so the names argument takes effect
    # index_col=False prevents pandas from treating the first column as the index
    print("Reading in csv file and creating pickle...")
    data = pan.read_csv(filename, delimiter=',',header=None,names=columns,index_col=False)
    # The raw PyTables API only accepts standard Python/Numpy types, not Pandas DataFrames directly;
    #     pandas' own HDFStore/to_hdf interface does store DataFrames (see the sketch cell after this cell's output)
    #h5file = tables.openFile(hffile, mode='w', driver="H5FD_CORE")
    #h5file.createArray(h5file.root, "array", data)
    #h5file.close()
    # Remove period in the label column
    data['Label']=data['Label'].apply(lambda x: x.strip('.'))
    # Write the pickle to the same path that is checked for above
    with open(folderpath,'wb') as output:
        pickle.dump(data, output, pickle.HIGHEST_PROTOCOL)

data

# This custom function to read in csv files is fastest and requires the least RAM, but requires
#      perfect input data as no allowances for missing or imperfect data are made
#def custom_loadtxt(filename, delimiter=',', skiprows=0, dtype=float):
#    def iter_func():
#        with open(filename, 'r') as infile:
#            for _ in range(skiprows):
#                next(infile)
#            for line in infile:
#                line = line.rstrip().split(delimiter)
#                for counter,item in enumerate(line):
#                    yield dtype[counter](item)
#        custom_loadtxt.rowlength = len(line)
#    data = np.fromiter(iter_func(), dtype=dtype)
#    data = data.reshape((-1, custom_loadtxt.rowlength))
#    return data




Pickle file containing data found. Loading it...
Out[2]:
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ... dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate Label
0 0 tcp http SF 215 45076 0 0 0 0 ... 0 0 0 0.00 0.00 0.00 0.00 0 0 normal
1 0 tcp http SF 162 4528 0 0 0 0 ... 1 1 0 1.00 0.00 0.00 0.00 0 0 normal
2 0 tcp http SF 236 1228 0 0 0 0 ... 2 1 0 0.50 0.00 0.00 0.00 0 0 normal
3 0 tcp http SF 233 2032 0 0 0 0 ... 3 1 0 0.33 0.00 0.00 0.00 0 0 normal
4 0 tcp http SF 239 486 0 0 0 0 ... 4 1 0 0.25 0.00 0.00 0.00 0 0 normal
5 0 tcp http SF 238 1282 0 0 0 0 ... 5 1 0 0.20 0.00 0.00 0.00 0 0 normal
6 0 tcp http SF 235 1337 0 0 0 0 ... 6 1 0 0.17 0.00 0.00 0.00 0 0 normal
7 0 tcp http SF 234 1364 0 0 0 0 ... 7 1 0 0.14 0.00 0.00 0.00 0 0 normal
8 0 tcp http SF 239 1295 0 0 0 0 ... 8 1 0 0.12 0.00 0.00 0.00 0 0 normal
9 0 tcp http SF 181 5450 0 0 0 0 ... 9 1 0 0.11 0.00 0.00 0.00 0 0 normal
10 0 tcp http SF 184 124 0 0 0 0 ... 10 1 0 0.10 0.00 0.00 0.00 0 0 normal
11 0 tcp http SF 185 9020 0 0 0 0 ... 11 1 0 0.09 0.00 0.00 0.00 0 0 normal
12 0 tcp http SF 239 1295 0 0 0 0 ... 12 1 0 0.08 0.00 0.00 0.00 0 0 normal
13 0 tcp http SF 181 5450 0 0 0 0 ... 13 1 0 0.08 0.00 0.00 0.00 0 0 normal
14 0 tcp http SF 236 1228 0 0 0 0 ... 14 1 0 0.07 0.00 0.00 0.00 0 0 normal
15 0 tcp http SF 233 2032 0 0 0 0 ... 15 1 0 0.07 0.00 0.00 0.00 0 0 normal
16 0 tcp http SF 238 1282 0 0 0 0 ... 16 1 0 0.06 0.00 0.00 0.00 0 0 normal
17 0 tcp http SF 235 1337 0 0 0 0 ... 17 1 0 0.06 0.00 0.00 0.00 0 0 normal
18 0 tcp http SF 234 1364 0 0 0 0 ... 18 1 0 0.06 0.00 0.00 0.00 0 0 normal
19 0 tcp http SF 239 486 0 0 0 0 ... 19 1 0 0.05 0.00 0.00 0.00 0 0 normal
20 0 tcp http SF 185 9020 0 0 0 0 ... 20 1 0 0.05 0.00 0.00 0.00 0 0 normal
21 0 tcp http SF 184 124 0 0 0 0 ... 21 1 0 0.05 0.00 0.00 0.00 0 0 normal
22 0 tcp http SF 181 5450 0 0 0 0 ... 22 1 0 0.05 0.00 0.00 0.00 0 0 normal
23 0 tcp http SF 239 1295 0 0 0 0 ... 23 1 0 0.04 0.00 0.00 0.00 0 0 normal
24 0 tcp http SF 236 1228 0 0 0 0 ... 24 1 0 0.04 0.00 0.00 0.00 0 0 normal
25 0 tcp http SF 233 2032 0 0 0 0 ... 25 1 0 0.04 0.00 0.00 0.00 0 0 normal
26 0 tcp http SF 239 486 0 0 0 0 ... 26 1 0 0.04 0.00 0.00 0.00 0 0 normal
27 0 tcp http SF 238 1282 0 0 0 0 ... 27 1 0 0.04 0.00 0.00 0.00 0 0 normal
28 0 tcp http SF 234 1364 0 0 0 0 ... 28 1 0 0.04 0.00 0.00 0.00 0 0 normal
29 0 tcp http SF 235 1337 0 0 0 0 ... 29 1 0 0.03 0.00 0.00 0.00 0 0 normal
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4898401 0 tcp http SF 283 466 0 0 0 0 ... 255 1 0 0.12 0.05 0.00 0.01 0 0 normal
4898402 0 tcp http SF 289 1096 0 0 0 0 ... 255 1 0 0.11 0.05 0.00 0.01 0 0 normal
4898403 0 tcp http SF 289 1862 0 0 0 0 ... 255 1 0 0.10 0.05 0.00 0.01 0 0 normal
4898404 0 tcp http SF 203 242 0 0 0 0 ... 255 1 0 0.09 0.05 0.00 0.01 0 0 normal
4898405 0 tcp http S1 196 0 0 0 0 0 ... 255 1 0 0.08 0.05 0.08 0.01 0 0 normal
4898406 0 tcp http SF 143 17463 0 0 0 0 ... 255 1 0 0.08 0.05 0.08 0.01 0 0 normal
4898407 0 tcp http SF 202 4017 0 0 0 0 ... 255 1 0 0.07 0.05 0.07 0.01 0 0 normal
4898408 0 tcp http SF 0 234 0 0 0 0 ... 255 1 0 0.07 0.05 0.07 0.01 0 0 normal
4898409 0 tcp http SF 203 1200 0 0 0 0 ... 255 1 0 0.06 0.05 0.06 0.01 0 0 normal
4898410 0 tcp http SF 198 2169 0 0 0 0 ... 255 1 0 0.06 0.05 0.06 0.01 0 0 normal
4898411 0 tcp http SF 197 466 0 0 0 0 ... 255 1 0 0.06 0.05 0.06 0.01 0 0 normal
4898412 0 tcp http SF 203 1862 0 0 0 0 ... 255 1 0 0.05 0.05 0.05 0.01 0 0 normal
4898413 0 tcp http SF 203 1096 0 0 0 0 ... 255 1 0 0.05 0.05 0.05 0.01 0 0 normal
4898414 0 tcp http SF 284 2286 0 0 0 0 ... 255 1 0 0.05 0.05 0.05 0.01 0 0 normal
4898415 0 tcp http SF 290 4017 0 0 0 0 ... 255 1 0 0.05 0.05 0.05 0.01 0 0 normal
4898416 0 tcp http SF 291 234 0 0 0 0 ... 255 1 0 0.04 0.05 0.04 0.01 0 0 normal
4898417 0 tcp http SF 291 242 0 0 0 0 ... 255 1 0 0.04 0.05 0.04 0.01 0 0 normal
4898418 0 tcp http SF 231 17463 0 0 0 0 ... 255 1 0 0.04 0.05 0.04 0.01 0 0 normal
4898419 0 tcp http SF 291 1200 0 0 0 0 ... 255 1 0 0.04 0.05 0.04 0.01 0 0 normal
4898420 0 tcp http SF 285 466 0 0 0 0 ... 255 1 0 0.04 0.05 0.04 0.01 0 0 normal
4898421 0 tcp http SF 286 2169 0 0 0 0 ... 255 1 0 0.04 0.05 0.04 0.01 0 0 normal
4898422 0 tcp http SF 291 1096 0 0 0 0 ... 255 1 0 0.03 0.05 0.03 0.01 0 0 normal
4898423 0 tcp http SF 291 1862 0 0 0 0 ... 255 1 0 0.03 0.05 0.03 0.01 0 0 normal
4898424 0 tcp http SF 159 15808 0 0 0 0 ... 255 1 0 1.00 0.05 0.00 0.01 0 0 normal
4898425 0 tcp http SF 219 244 0 0 0 0 ... 255 1 0 0.50 0.05 0.00 0.01 0 0 normal
4898426 0 tcp http SF 212 2288 0 0 0 0 ... 255 1 0 0.33 0.05 0.00 0.01 0 0 normal
4898427 0 tcp http SF 219 236 0 0 0 0 ... 255 1 0 0.25 0.05 0.00 0.01 0 0 normal
4898428 0 tcp http SF 218 3610 0 0 0 0 ... 255 1 0 0.20 0.05 0.00 0.01 0 0 normal
4898429 0 tcp http SF 219 1234 0 0 0 0 ... 255 1 0 0.17 0.05 0.00 0.01 0 0 normal
4898430 0 tcp http SF 219 1098 0 0 0 0 ... 255 1 0 0.14 0.05 0.00 0.01 0 0 normal

4898431 rows × 42 columns
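
In [ ]:
# Sketch (not executed here): pandas can persist the DataFrame to HDF5 directly through its own
#     HDFStore/to_hdf interface (backed by PyTables), which avoids the pickle step entirely.
#     The file name and key below are illustrative assumptions, not files used elsewhere in this notebook.
hdfpath = os.path.join(datadir, "data.h5")
data.to_hdf(hdfpath, "kdd", mode='w')          # write the full DataFrame to HDF5
data_from_hdf = pan.read_hdf(hdfpath, "kdd")   # read it back as a DataFrame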


In [3]:
# All the labels in the data, and their counts
labelcounts=data['Label'].value_counts()
print labelcounts
labelcounts.plot(kind='bar')


smurf              2807886
neptune            1072017
normal              972781
satan                15892
ipsweep              12481
portsweep            10413
nmap                  2316
back                  2203
warezclient           1020
teardrop               979
pod                    264
guess_passwd            53
buffer_overflow         30
land                    21
warezmaster             20
imap                    12
rootkit                 10
loadmodule               9
ftp_write                8
multihop                 7
phf                      4
perl                     3
spy                      2
dtype: int64
Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe6aab23bd0>

In [4]:
# Form the Numpy arrays that scikit-learn requires as input for its classification algorithms
# Internally, a Pandas DataFrame stores its data in Numpy arrays; the .values attribute exposes them

# X will be the (transaction x features) array (excluding the labels, those will go in y)
# We exclude the string-valued features ("protocol_type","service","flag") for now (a one-hot encoding sketch follows this cell)
X = data[["duration","src_bytes","dst_bytes","land","wrong_fragment",
               "urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted",
               "num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds",
               "is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
               "rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate",
               "dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
               "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
               "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate"]].values

# y will be the (transactions x 1) vector with the label of each transaction
y = data['Label'].values

# target_names will be the vector with the possible labels/classes
target_names = ["back","buffer_overflow","ftp_write","guess_passwd","imap","ipsweep","land","loadmodule",
                "multihop","neptune","nmap","normal","perl","phf","pod","portsweep","rootkit","satan",
                "smurf","spy","teardrop","warezclient","warezmaster"]

In [5]:
print "Size of X (nsamples, nfeatures):", X.shape
print "Excerpt of X"
print X[:10,:10]
print "Size of y (nsamples):", y.shape
print "Class names:", target_names


Size of X (nsamples, nfeatures): (4898431, 38)
Excerpt of X
[[  0.00000000e+00   2.15000000e+02   4.50760000e+04   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   1.62000000e+02   4.52800000e+03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   2.36000000e+02   1.22800000e+03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   2.33000000e+02   2.03200000e+03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   2.39000000e+02   4.86000000e+02   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   2.38000000e+02   1.28200000e+03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   2.35000000e+02   1.33700000e+03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   2.34000000e+02   1.36400000e+03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   2.39000000e+02   1.29500000e+03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   1.81000000e+02   5.45000000e+03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00]]
Size of y (nsamples): (4898431,)
Class names: ['back', 'buffer_overflow', 'ftp_write', 'guess_passwd', 'imap', 'ipsweep', 'land', 'loadmodule', 'multihop', 'neptune', 'nmap', 'normal', 'perl', 'phf', 'pod', 'portsweep', 'rootkit', 'satan', 'smurf', 'spy', 'teardrop', 'warezclient', 'warezmaster']

In [6]:
from sklearn.preprocessing import StandardScaler #transform data to 0 mean and unit variance (needed for many classifiers)
from random import sample

# Form a subset of the data by sampling without replacement, to explore model performances with limited computation
# (and also, using the full data set gives a Memory Error)

# number of entries corresponding to 10% of the data
ents = int(len(X)*0.1)
# To take the first elements:
#X_small = X[:ents]
#y_small = y[:ents]
# Take a random sample of 10% of the rows (indices drawn without replacement)
xs = sample(range(len(X)),ents)
X_small=X[xs]
y_small=y[xs]

distinct, counts = np.unique(y_small, return_counts=True)
pairs = zip(distinct,counts)
#print sorted(pairs,key=lambda x: x[1],reverse=True)
#print np.asarray((distinct, counts)).T

fulldistinct, fullcounts = np.unique(y, return_counts=True)
fullpairs = zip(fulldistinct, fullcounts)
#print sorted(fullpairs, key=lambda x: x[1], reverse=True)

#comparet=[[types1, fullcount, count] for (types1, fullcount) in fullpairs
#          for (types2, count) in pairs if types1==types2]
#comparet = np.concatenate((list(fullpairs),list(pairs[:,1])),axis=1)
#print sorted(comparet, key=lambda x: x[1], reverse=True)

# For each class, show its count in the full data set and in the sampled subset, and its percentage of each
# This is done to check that our sample contains all the main classes of the full data set,
#     in roughly the same proportions
# If so we can proceed with this sample
#     and have some confidence that the relative accuracies of the models in the cross validation
#     will be representative of those which would be obtained from the full training set
# (a stratified sampling alternative is sketched in the unexecuted cell after this one)
comparet1 = [[types1, fullcount, count,
              "%.2G" % (float(fullcount)/float(len(y))*100.) + '%',
              "%.2G" % (float(count)/float(len(y_small))*100.) + '%']
             for (types1, fullcount) in fullpairs
             for (types2, count) in pairs if types1 == types2]
pprint.pprint(sorted(comparet1, key=lambda x: x[1], reverse=True))

#train scaler and apply to data
scaler = StandardScaler()
scaler.fit(X_small)
X_scaled = scaler.transform(X_small)


[['smurf', 2807886, 280838, '57%', '57%'],
 ['neptune', 1072017, 107027, '22%', '22%'],
 ['normal', 972781, 97347, '20%', '20%'],
 ['satan', 15892, 1670, '0.32%', '0.34%'],
 ['ipsweep', 12481, 1221, '0.25%', '0.25%'],
 ['portsweep', 10413, 1015, '0.21%', '0.21%'],
 ['nmap', 2316, 257, '0.047%', '0.052%'],
 ['back', 2203, 212, '0.045%', '0.043%'],
 ['warezclient', 1020, 97, '0.021%', '0.02%'],
 ['teardrop', 979, 103, '0.02%', '0.021%'],
 ['pod', 264, 30, '0.0054%', '0.0061%'],
 ['guess_passwd', 53, 7, '0.0011%', '0.0014%'],
 ['buffer_overflow', 30, 5, '0.00061%', '0.001%'],
 ['land', 21, 3, '0.00043%', '0.00061%'],
 ['warezmaster', 20, 3, '0.00041%', '0.00061%'],
 ['imap', 12, 1, '0.00024%', '0.0002%'],
 ['rootkit', 10, 1, '0.0002%', '0.0002%'],
 ['loadmodule', 9, 1, '0.00018%', '0.0002%'],
 ['ftp_write', 8, 3, '0.00016%', '0.00061%'],
 ['phf', 4, 1, '8.2E-05%', '0.0002%'],
 ['spy', 2, 1, '4.1E-05%', '0.0002%']]
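
In [ ]:
# Sketch (not executed here): a stratified sample would guarantee the class proportions checked above,
#     rather than relying on a plain random sample to reproduce them. This assumes the old
#     sklearn.cross_validation API used elsewhere in this notebook; X_strat/y_strat are illustrative names.
from sklearn.cross_validation import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(y, n_iter=1, test_size=0.1, random_state=0)
for _, sample_idx in sss:   # the "test" indices form the stratified 10% sample
    X_strat, y_strat = X[sample_idx], y[sample_idx]
print "Stratified sample size:", X_strat.shape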

In [7]:
# Function to plot the confusion matrix
def plot_confusion_matrix(y_true, y_pred, target_names):
    ncls = len(target_names)
    cm = confusion_matrix(y_true, y_pred)

    # Normalize each row of the confusion matrix by its row sum, guarding against empty rows
    #     (a numpy-broadcasting equivalent is sketched in the unexecuted cell below)
    norm_conf = []
    for i in cm:
        tmp_arr = []
        a = sum(i, 0)
        for j in i:
            if (a==0): tmp_arr.append(0)
            else: tmp_arr.append(float(j)/float(a))
        norm_conf.append(tmp_arr)

    fig = plt.figure(figsize=(12,12))
    plt.clf()
    ax = fig.add_subplot(111)
    ax.set_aspect(1)
    res = ax.imshow(np.array(norm_conf), cmap=plt.cm.jet, 
                    interpolation='nearest')

    # Annotate each cell with the raw count
    for x in xrange(ncls):
        for y in xrange(ncls):
            ax.annotate(str(cm[x][y]), xy=(y, x), 
                        horizontalalignment='center',
                        verticalalignment='center',
                        fontsize=8)

    cb = fig.colorbar(res)

    plt.xticks(np.arange(ncls), target_names, fontsize=16,rotation='vertical')
    plt.yticks(np.arange(ncls), target_names, fontsize=16)

    plt.xlabel("Predicted label", fontsize=20)
    plt.ylabel("True label", fontsize=20)
    plt.title("Confusion matrix", fontsize=22)

    return fig
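
In [ ]:
# Sketch (not executed here): the row-normalization loop in plot_confusion_matrix can be written with
#     numpy broadcasting instead. cm_example is an illustrative array, not data from this notebook.
cm_example = np.array([[5, 1], [2, 2], [0, 0]], dtype=float)
row_sums = cm_example.sum(axis=1, keepdims=True)
norm_cm = cm_example / np.maximum(row_sums, 1)   # empty rows stay all-zero instead of dividing by zero
print norm_cm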

In [8]:
# Prepare for cross validation runs
from sklearn.preprocessing import StandardScaler #transform data to 0 mean and unit variance (needed for many classifiers)
from sklearn.metrics import accuracy_score #Accuracy metric (fraction correct)
from sklearn.cross_validation import KFold # Cross validation
from sklearn.cross_validation import StratifiedKFold # Cross validation
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report

#cross_val = KFold(len(y), 5)

from sklearn.cross_validation import train_test_split

def cross_validate(X,y,kfolds,model,**kwargs):
    """Run cross validation of the model on the feature matrix X, with label vector y, using stratified kfolds"""
    assert(kfolds>=1), "Invalid number of kfolds entered, must be int >= 1"
    if (kfolds==1): 
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        #X_train, X_test = X[train_idx], X[test_idx]
        #y_train, y_test = y[train_idx], y[test_idx]
        #train_idx,test_idx = train_test_split(range(0,len(y)-1))
        #cross_val = zip(train_idx,test_idx)
        # A single "fold": one random 70/30 train/test split, wrapped in a list so the loop below still works
        cross_val = [train_test_split(np.arange(len(y)), test_size=0.3)]
    elif (kfolds>=2):
        #cross_val = KFold(len(y),kfolds)
        cross_val = StratifiedKFold(y,kfolds)
    y_true = []
    y_pred = []
    k_fold_acc = []
    for train_idx, test_idx in cross_val:
        #select training and test data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        #train scaler and apply to both sets
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        clf = model(**kwargs)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        y_true = np.append(y_true, y_test)
        y_pred = np.append(y_pred, pred)
        # NB: cumulative accuracy over all folds processed so far (the per-fold accuracy would be np.mean(pred==y_test))
        k_fold_acc = np.append(k_fold_acc, np.mean(y_pred==y_true))
        print "Accuracy of "+str(model)+" in past fold: %2.3f" % np.mean(y_true==y_pred)
    print "Mean accuracy of "+str(model)+" over "+str(kfolds)+" k-folds: %2.3f" % np.mean(k_fold_acc)
    print "For comparison, accuracy from purely random classification: %2.3f" % (1.0/float(len(np.unique(y))))
    print "Classification report for "+str(model)
    print classification_report(y_true, y_pred, target_names=target_names)
    #print "Confusion matrix for "+str(model)
    #pprint.pprint(confusion_matrix(y_true, y_pred))
    plot_confusion_matrix(y_true, y_pred, target_names=np.unique(y_true))
    
    
# We also provide the cross validation fold generation and the model runs as separate functions, to use if desired.
#    This is so that: a) the cross validation folds aren't unnecessarily regenerated for each algorithm
#                     b) the models are compared on exactly the same training and test data sets
#                        (accuracies for the individual k-folds can then be compared directly)

def form_cross_val_set(X,y,kfolds):
    """Run cross validation of the model on the feature matrix X, with label vector y, using stratified kfolds"""
    assert(kfolds>=1), "Invalid number of kfolds entered, must be int >= 1"
    if (kfolds==1): 
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        #X_train, X_test = X[train_idx], X[test_idx]
        #y_train, y_test = y[train_idx], y[test_idx]
        #train_idx,test_idx = train_test_split(range(0,len(y)-1))
        #cross_val = zip(train_idx,test_idx)
        # A single "fold": one random 70/30 train/test split, wrapped in a list so callers can iterate over it
        cross_val = [train_test_split(np.arange(len(y)), test_size=0.3)]
    elif (kfolds>=2):
        #cross_val = KFold(len(y),kfolds)
        cross_val = StratifiedKFold(y,kfolds)
    return cross_val
            
def run_model(cross_val_object, model,**kwargs):        
    y_true = []
    y_pred = []
    k_fold_acc = []
    for train_idx, test_idx in cross_val_object:
        #select training and test data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        #train scaler and apply to both sets
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        clf = model(**kwargs)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        y_true = np.append(y_true, y_test)
        y_pred = np.append(y_pred, pred)
        # NB: cumulative accuracy over all folds processed so far (the per-fold accuracy would be np.mean(pred==y_test))
        k_fold_acc = np.append(k_fold_acc, np.mean(y_pred==y_true))
        print "Accuracy of "+str(model)+" in past fold: %2.3f" % np.mean(y_true==y_pred)
    print "Mean accuracy of "+str(model)+" over all k-folds: %2.3f" % np.mean(k_fold_acc)
    print "For comparison, accuracy from purely random classification: %2.3f" % (1.0/float(len(np.unique(y))))
    print "Classification report for "+str(model)
    print classification_report(y_true, y_pred, target_names=target_names)
    #print "Confusion matrix for "+str(model)
    #pprint.pprint(confusion_matrix(y_true, y_pred))
    plot_confusion_matrix(y_true, y_pred, target_names=np.unique(y_true))

In [9]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import SGDClassifier as SGD
# Perceptron() is actually equivalent to:
#    SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None).
from sklearn.linear_model import Perceptron
#from sklearn.svm import LinearSVC as SVM
#from sklearn.neighbors import KNeighborsClassifier as KNN
#from sklearn.linear_model import LogisticRegression

cv = form_cross_val_set(X=X,y=y,kfolds=5)
run_model(cross_val_object=cv, model=RF, n_jobs=4)
run_model(cross_val_object=cv, model=SGD, n_jobs=4, class_weight='auto')
run_model(cross_val_object=cv, model=Perceptron, n_jobs=4, class_weight='auto')

# To run each model with its own (separately generated) cross validation splits instead:
#cross_validate(X=X,y=y,kfolds=5,model=RF,n_jobs=4)
#cross_validate(X=X,y=y,kfolds=5,model=SGD,n_jobs=4)
#cross_validate(X=X,y=y,kfolds=5,model=Perceptron,n_jobs=4)
#cross_validate(X=X_small,y=y_small,kfolds=1,model=SVM)


Accuracy of <class 'sklearn.ensemble.forest.RandomForestClassifier'> in past fold: 0.999
Accuracy of <class 'sklearn.ensemble.forest.RandomForestClassifier'> in past fold: 0.999
Accuracy of <class 'sklearn.ensemble.forest.RandomForestClassifier'> in past fold: 0.999
Accuracy of <class 'sklearn.ensemble.forest.RandomForestClassifier'> in past fold: 0.999
Accuracy of <class 'sklearn.ensemble.forest.RandomForestClassifier'> in past fold: 0.958
Mean accuracy of <class 'sklearn.ensemble.forest.RandomForestClassifier'> over all k-folds: 0.991
For comparison, accuracy from purely random classification: 0.043
/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:417: Warning: The least populated class in y has only 2 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:959: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Classification report for <class 'sklearn.ensemble.forest.RandomForestClassifier'>
                 precision    recall  f1-score   support

           back       1.00      1.00      1.00      2203
buffer_overflow       0.76      0.53      0.63        30
      ftp_write       1.00      0.25      0.40         8
   guess_passwd       1.00      0.92      0.96        53
           imap       0.80      0.67      0.73        12
        ipsweep       1.00      0.87      0.93     12481
           land       0.56      0.43      0.49        21
     loadmodule       1.00      0.11      0.20         9
       multihop       0.33      0.14      0.20         7
        neptune       1.00      0.81      0.89   1072017
           nmap       0.73      0.82      0.77      2316
         normal       0.86      1.00      0.93    972781
           perl       0.67      0.67      0.67         3
            phf       1.00      0.75      0.86         4
            pod       0.98      0.97      0.98       264
      portsweep       0.18      0.97      0.31     10413
        rootkit       0.00      0.00      0.00        10
          satan       0.79      0.99      0.88     15892
          smurf       1.00      1.00      1.00   2807886
            spy       0.00      0.00      0.00         2
       teardrop       1.00      0.99      0.99       979
    warezclient       0.98      0.90      0.94      1020
    warezmaster       1.00      0.65      0.79        20

    avg / total       0.97      0.96      0.96   4898431

Accuracy of <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in past fold: 0.573
Accuracy of <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in past fold: 0.573
Accuracy of <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in past fold: 0.573
Accuracy of <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in past fold: 0.573
Accuracy of <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in past fold: 0.573
Mean accuracy of <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> over all k-folds: 0.573
For comparison, accuracy from purely random classification: 0.043
Classification report for <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
                 precision    recall  f1-score   support

           back       0.00      0.00      0.00      2203
buffer_overflow       0.06      0.60      0.11        30
      ftp_write       0.00      0.00      0.00         8
   guess_passwd       0.00      0.00      0.00        53
           imap       0.00      0.00      0.00        12
        ipsweep       0.00      0.00      0.00     12481
           land       0.63      0.57      0.60        21
     loadmodule       0.00      0.00      0.00         9
       multihop       0.00      0.00      0.00         7
        neptune       0.00      0.00      0.00   1072017
           nmap       0.00      0.00      0.00      2316
         normal       0.00      0.00      0.00    972781
           perl       0.33      0.67      0.44         3
            phf       1.00      0.25      0.40         4
            pod       0.00      0.00      0.00       264
      portsweep       0.00      0.00      0.00     10413
        rootkit       0.00      0.00      0.00        10
          satan       0.00      0.00      0.00     15892
          smurf       0.57      1.00      0.73   2807886
            spy       0.00      0.00      0.00         2
       teardrop       1.00      0.79      0.88       979
    warezclient       0.00      0.00      0.00      1020
    warezmaster       0.00      0.00      0.00        20

    avg / total       0.33      0.57      0.42   4898431

Accuracy of <class 'sklearn.linear_model.perceptron.Perceptron'> in past fold: 0.990
Accuracy of <class 'sklearn.linear_model.perceptron.Perceptron'> in past fold: 0.707
Accuracy of <class 'sklearn.linear_model.perceptron.Perceptron'> in past fold: 0.796
Accuracy of <class 'sklearn.linear_model.perceptron.Perceptron'> in past fold: 0.827
Accuracy of <class 'sklearn.linear_model.perceptron.Perceptron'> in past fold: 0.810
Mean accuracy of <class 'sklearn.linear_model.perceptron.Perceptron'> over all k-folds: 0.826
For comparison, accuracy from purely random classification: 0.043
Classification report for <class 'sklearn.linear_model.perceptron.Perceptron'>
                 precision    recall  f1-score   support

           back       0.04      0.34      0.07      2203
buffer_overflow       0.02      0.40      0.05        30
      ftp_write       0.00      0.25      0.00         8
   guess_passwd       0.03      0.77      0.06        53
           imap       0.00      0.33      0.01        12
        ipsweep       0.66      0.61      0.64     12481
           land       0.01      0.81      0.02        21
     loadmodule       0.00      0.00      0.00         9
       multihop       0.00      0.00      0.00         7
        neptune       0.97      0.80      0.88   1072017
           nmap       0.04      0.32      0.07      2316
         normal       0.99      0.85      0.91    972781
           perl       0.00      0.00      0.00         3
            phf       0.00      0.25      0.00         4
            pod       0.00      0.01      0.00       264
      portsweep       0.03      0.79      0.06     10413
        rootkit       0.00      0.10      0.00        10
          satan       0.78      0.74      0.76     15892
          smurf       1.00      0.80      0.89   2807886
            spy       0.00      0.00      0.00         2
       teardrop       0.67      1.00      0.80       979
    warezclient       0.01      0.10      0.01      1020
    warezmaster       0.00      0.10      0.00        20

    avg / total       0.99      0.81      0.89   4898431

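
In [ ]:
# A more demanding baseline than random guessing: always predict the majority class ('smurf', ~57% of
#     the data set). The SGD accuracy above is essentially at this level, i.e. it is doing little better
#     than always predicting the most common class.
print "Majority-class baseline accuracy: %2.3f" % (float(np.max(fullcounts)) / float(len(y)))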

In [ ]:
#from sklearn.neighbors import KNeighborsClassifier #The classification algorithm
# We of course should not be testing on the training set, but we are only checking that the code runs as expected
#K = 1 # number of nearest neighbours (Normally we'd optimize this in an inner loop!)
#Fit classifier and test it on the data
#clf = KNeighborsClassifier(K)
#clf.fit(X_scaled, y_small)
#y_pred = clf.predict(X_scaled)

#calculate the percentage correctly labelled samples
#print "Accuracy: %2.3f" % accuracy_score(y_small, y_pred)

In [ ]:
# Now do it properly, with k-fold cross validation
from sklearn.neighbors import KNeighborsClassifier
#Define classifier
K = 1 #number of nearest neighbours
clf = KNeighborsClassifier(K)
#Define cross-validation strategy
cross_val = KFold(len(y_small), 5)

# loop over fold indices
# train on the training set, test on the test set
# accumulate predictions to compute accuracy
y_true = []
y_pred = []
for train_idx, test_idx in cross_val:
    #select training and test data
    X_train, X_test = X_small[train_idx], X_small[test_idx]
    y_train, y_test = y_small[train_idx], y_small[test_idx]
    #train scaler and apply to both sets
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    #train and test classifier
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    y_true = np.append(y_true, y_test)
    y_pred = np.append(y_pred, pred)

print "Accuracy: %2.3f" % accuracy_score(y_true, y_pred)

In [ ]:
# We can do the same with much less code using scikit-learn's pipelines
from sklearn.ensemble import RandomForestClassifier

#Define classifier
clf = RandomForestClassifier(n_estimators=10,max_features='auto',max_depth=None,n_jobs=4)
#Chain the scaler and the classifier into a pipeline, then cross-validate it on the sampled subset
pipe = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(pipe, X_small, y_small, scoring='accuracy', cv=cross_val)
print scores
print "Accuracy: %2.3f" % np.mean(scores)

In [ ]:
# We can do the same with much less code using scikit-learn's pipelines
from sklearn.neighbors import KNeighborsClassifier
#Define classifier
K = 5 #number of nearest neighbours
clf = KNeighborsClassifier(K)
#Chain the scaler and the classifier into a pipeline, then cross-validate it on the sampled subset
pipe = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(pipe, X_small, y_small, scoring='accuracy', cv=cross_val)
print scores
print "Accuracy: %2.3f" % np.mean(scores)

In [ ]:
# We can do the same with much less code using scikit-learn's pipelines
from sklearn import svm
#Define classifier
# LinearSVC(), SVC(), NuSVC()
clf = svm.LinearSVC(dual=False,fit_intercept=True)
#Chain the scaler and the classifier into a pipeline, then cross-validate it on the sampled subset
pipe = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(pipe, X_small, y_small, scoring='accuracy', cv=cross_val)
print scores
print "Accuracy: %2.3f" % np.mean(scores)

In [ ]:
# Compute some evaluation metrics

print classification_report(y_true, y_pred, target_names=target_names)
print "Confusion matrix"
print confusion_matrix(y_true, y_pred)
print "For comparison, precision from purely random classification: ",1.0/float(len(target_names))

plot_confusion_matrix(y_true, y_pred, target_names)

In [ ]:
# Plot the classification boundary between two of the features
# Beware: this means training the algorithm on the chosen pair of features, so it is not computationally negligible

from matplotlib.colors import ListedColormap
#import matplotlib.cm as cmx
#import matplotlib.colors as colors

# Choose size of data subset to use
entries = int(len(X)*0.01)
data_subset = data[:entries]

# Choose the two features to consider
X_plot = data_subset[["num_failed_logins","logged_in"]].values
y_plot = data_subset['Label'].values

h = .01  # step size in the plotting mesh

# Create color maps
#cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
#cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])


# 128 colours, for future use if required
colrs=  ["#000000", "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059",
        "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87",
        "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80",
        "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",
        "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",
        "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",
        "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",
        "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",

        "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",
        "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",
        "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",
        "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",
        "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72", "#6A3A4C",
        "#83AB58", "#001C1E", "#D1F7CE", "#004B28", "#C8D0F6", "#A3A489", "#806C66", "#222800",
        "#BF5650", "#E83000", "#66796D", "#DA007C", "#FF1A59", "#8ADBB4", "#1E0200", "#5B4E51",
        "#C895C5", "#320033", "#FF6832", "#66E1D3", "#CFCDAC", "#D0AC94", "#7ED379", "#012C58"]


# We train the algorithm and plot its decision boundaries in the two features, no testing involved

# Scale the two features once, outside the loop, so the data is not rescaled again on the second pass
scaler_plot = StandardScaler().fit(X_plot)
X_plot = scaler_plot.transform(X_plot)

n_neighbours = 1 # Number of nearest neighbours
for weights in ['uniform', 'distance']:
    # create an instance of the Neighbours Classifier and fit the data

    # Train the model
    clf_plot = KNeighborsClassifier(n_neighbours, weights=weights)
    clf_plot.fit(X_plot, y_plot)

    # Generate a mesh [x_min, x_max]x[y_min, y_max] and predict the label for each point
    x_min, x_max = X_plot[:, 0].min() - 1, X_plot[:, 0].max() + 1
    y_min, y_max = X_plot[:, 1].min() - 1, X_plot[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf_plot.predict(np.c_[xx.ravel(), yy.ravel()])

    # Map the predicted classes (strings) to integers so pcolormesh can colour the regions,
    #     and use the number of distinct classes found to select the number of colours required
    classes_found, Z_num = np.unique(Z, return_inverse=True)
    colrs_req = ListedColormap(colrs[:len(classes_found)])

    # Put the result into a colour plot
    Z_num = Z_num.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z_num, cmap=colrs_req)

    # Plot also the training points, coloured by their (integer-mapped) true class
    _, y_plot_num = np.unique(y_plot, return_inverse=True)
    plt.scatter(X_plot[:, 0], X_plot[:, 1], c=y_plot_num)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("Two-feature classification boundary (k = %i, weights = '%s')"
              % (n_neighbours, weights))

plt.show()

