In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os,sys
#import csv
import pandas as pan
import cPickle as pickle
import pprint
#import timeit
get_ipython().magic(u'matplotlib inline')
#import tables #PyTables used to generate HDF5 file instead of pickle
In [2]:
rootdir="/home/ilan/Desktop/KDD_1999/"
os.chdir(rootdir)
datadir="/home/ilan/Desktop/KDD_1999/KDD_1999_data/"
filename = 'kddcup.data.corrected'
# Cache the parsed DataFrame as a pickle so the large csv is only parsed once.
pklfile="data.pkl"
folderpath=os.path.join(datadir,pklfile)
if os.path.exists(folderpath):
    print("Pickle file containing data found. Loading it...")
    # 'rb' (not 'r'): pickles written with HIGHEST_PROTOCOL are binary.
    with open(folderpath,'rb') as infile:
        data = pickle.load(infile)
else:
    # Column names for the 41 KDD'99 features plus the class label.
    columns = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment",
               "urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted",
               "num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds",
               "is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
               "rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate",
               "dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
               "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
               "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","Label"]
    # header=None: the csv has no header row, so the names list supplies the columns.
    # index_col=False: otherwise pandas treats the first column as the index.
    print("Reading in csv file and creating pickle...")
    data = pan.read_csv(filename, delimiter=',',header=None,names=columns,index_col=False)
    # Remove the trailing period in the label column ("normal." -> "normal").
    data['Label']=data['Label'].apply(lambda x: x.strip('.'))
    # BUG FIX: previously the pickle was dumped to pklfile relative to the cwd
    # (rootdir), while the existence check above looks in datadir -- so the
    # cache was never found on later runs. Dump to the same path that is read.
    with open(folderpath,'wb') as output:
        pickle.dump(data, output, pickle.HIGHEST_PROTOCOL)
data
# This custom function to read in csv files is fastest and requires the least RAM, but requires
# perfect input data as no allowances for missing or imperfect data are made
#def custom_loadtxt(filename, delimiter=',', skiprows=0, dtype=float):
# def iter_func():
# with open(filename, 'r') as infile:
# for _ in range(skiprows):
# next(infile)
# for line in infile:
# line = line.rstrip().split(delimiter)
# for counter,item in enumerate(line):
# yield dtype[counter](item)
# custom_loadtxt.rowlength = len(line)
# data = np.fromiter(iter_func(), dtype=dtype)
# data = data.reshape((-1, custom_loadtxt.rowlength))
# return data
#filename1= (str(args.corpus).replace('.mm',''))+'_lsi_topics.pkl'
#with open(filename1,'wb') as output:
# pickle.dump(corpustopics, output)
#topics=pickle.load(open(topicspath,'r'))
Out[2]:
In [3]:
# All the labels in the data, and their counts
# value_counts() returns the per-label record counts, most frequent first;
# the bar plot gives a quick view of the (heavy) class imbalance.
labelcounts=data['Label'].value_counts()
print labelcounts
labelcounts.plot(kind='bar')
Out[3]:
In [4]:
# Form the Numpy arrays that scikit-learn requires as input for its classification algorithms
# Internally, Pandas DataFrames are Numpy arrays, use the .values suffix to get these
# X will be the (transaction x features) array (excluding the labels, those will go in y)
# We will exclude features which are strings ("protocol_type","service","flag") for now
X = data[["duration","src_bytes","dst_bytes","land","wrong_fragment",
"urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted",
"num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds",
"is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
"rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate",
"dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
"dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
"dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate"]].values
# y will be the (transactions x 1) vector with the label of each transaction
y = data['Label'].values
# target_names will be the vector with the possible labels/classes
# NOTE(review): this list is alphabetically ordered, which must match the
# sorted label order np.unique/classification_report use -- confirm it lists
# exactly the labels present in this version of the data set.
target_names = ["back","buffer_overflow","ftp_write","guess_passwd","imap","ipsweep","land","loadmodule",
"multihop","neptune","nmap","normal","perl","phf","pod","portsweep","rootkit","satan",
"smurf","spy","teardrop","warezclient","warezmaster"]
In [5]:
# Sanity checks on the feature matrix and label vector built above
print "Size of X (nsamples, nfeatures):", X.shape
print "Excerpt of X"
# Show only the top-left 10x10 corner rather than dumping the full array
print X[:10,:10]
print "Size of y (nsamples):", y.shape
print "Class names:", target_names
In [6]:
from sklearn.preprocessing import StandardScaler #transform data to 0 mean and unit variance (needed for many classifiers)
from random import sample
# Form a subset of the data by sampling without replacement, to explore model performances with limited computation
# (and also, using the full data set gives a Memory Error)
# Number of entries corresponding to 10% of the data
# (NOTE: the old comments said 1%, but the factor is 0.1 = 10%)
ents = int(len(X)*0.1)
# Take a random sample of size 10% from the data.
# BUG FIX: range(0, len(X)-1) excluded the last row (range already stops
# before its end point); range(len(X)) lets every row be drawn.
xs = sample(range(len(X)), ents)
X_small = X[xs]
y_small = y[xs]
# Per-class counts in the sample and in the full data set
distinct, counts = np.unique(y_small, return_counts=True)
pairs = zip(distinct, counts)
fulldistinct, fullcounts = np.unique(y, return_counts=True)
fullpairs = zip(fulldistinct, fullcounts)
# For each class show count in full data set, in sampled subset, and % in full data set and in subset
# This is done to ensure that our sample contains all the main classes of the full data set,
# in roughly the same proportions
# If so we can proceed with this sample
# and have some confidence that the relative accuracies of the models in the cross validation
# will be representative of those which would be obtained from the full training set
comparet1=[[types1, fullcount, count, "%.2G" % (float(fullcount)/float(len(y))*100.)+'%', "%.2G" % (float(count)/float(len(y_small))*100.)+'%']
           for (types1, fullcount) in fullpairs
           for (types2, count) in pairs if types1==types2]
pprint.pprint(sorted(comparet1, key=lambda x: x[1], reverse=True))
# Train scaler on the sample and apply it (many classifiers need zero-mean,
# unit-variance features)
scaler = StandardScaler()
scaler.fit(X_small)
X_scaled = scaler.transform(X_small)
In [7]:
# Function to plot the confusion matrix
def plot_confusion_matrix(y_true, y_pred, target_names):
    """Plot a confusion matrix for the given true/predicted label vectors.

    Cell colours show the row-normalised fractions (each row = one true
    class), while the text annotations show the raw counts.

    y_true, y_pred : label vectors of equal length
    target_names   : class names used as axis tick labels (their order must
                     match the sorted-label order confusion_matrix uses)
    Returns the matplotlib Figure containing the plot.
    """
    ncls = len(target_names)
    cm = confusion_matrix(y_true, y_pred)
    # Normalise each row to fractions; an all-zero row (class absent from
    # y_true) stays zero instead of dividing by zero.
    norm_conf = []
    for row in cm:
        rowsum = sum(row, 0)
        if rowsum == 0:
            norm_conf.append([0] * len(row))
        else:
            norm_conf.append([float(cell) / float(rowsum) for cell in row])
    fig = plt.figure(figsize=(12,12))
    ax = fig.add_subplot(111)
    ax.set_aspect(1)
    # BUG FIX: a later plt.imshow(cm, ...) used to overwrite this normalised
    # image with the raw counts, discarding norm_conf entirely.
    res = ax.imshow(np.array(norm_conf), cmap=plt.cm.jet,
                    interpolation='nearest')
    # Annotate every cell with its raw count (note: xy is (col, row))
    for x in xrange(ncls):
        for y in xrange(ncls):
            ax.annotate(str(cm[x][y]), xy=(y, x),
                        horizontalalignment='center',
                        verticalalignment='center',
                        fontsize=8)
    fig.colorbar(res)
    plt.xticks(np.arange(ncls), target_names, fontsize=16,rotation='vertical')
    plt.yticks(np.arange(ncls), target_names, fontsize=16)
    plt.xlabel("Predicted label", fontsize=20)
    plt.ylabel("True label", fontsize=20)
    plt.title("Confusion matrix", fontsize=22)
    # BUG FIX: previously a fresh empty figure was created here and returned,
    # so callers received a blank figure instead of the plot above.
    return fig
In [8]:
# Prepare for cross validation runs
from sklearn.preprocessing import StandardScaler #transform data to 0 mean and unit variance (needed for many classifiers)
from sklearn.metrics import accuracy_score #Accuracy metric (fraction correct)
from sklearn.cross_validation import KFold # Cross validation
from sklearn.cross_validation import StratifiedKFold # Cross validation
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report
#cross_val = KFold(len(y), 5)
from sklearn.cross_validation import train_test_split
def cross_validate(X,y,kfolds,model,**kwargs):
"""Run cross validation of the model on the feature matrix X, with label vector y, using stratified kfolds"""
assert(kfolds>=1), "Invalid number of kfolds entered, must be int >= 1"
if (kfolds==1):
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#X_train, X_test = X[train_idx], X[test_idx]
#y_train, y_test = y[train_idx], y[test_idx]
#train_idx,test_idx = train_test_split(range(0,len(y)-1))
#cross_val = zip(train_idx,test_idx)
cross_val = [[0,0]]
cross_val[0] = train_test_split(range(0,len(y)-1), test_size=0.3)
elif (kfolds>=2):
#cross_val = KFold(len(y),kfolds)
cross_val = StratifiedKFold(y,kfolds)
y_true = []
y_pred = []
k_fold_acc = []
for train_idx, test_idx in cross_val:
#select training and test data
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
#train scaler and apply to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
clf = model(**kwargs)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
y_true = np.append(y_true, y_test)
y_pred = np.append(y_pred, pred)
k_fold_acc = np.append(k_fold_acc, np.mean(y_pred==y_true))
print "Accuracy of "+str(model)+" in past fold: %2.3f" % np.mean(y_true==y_pred)
print "Mean accuracy of "+str(model)+" over "+str(kfolds)+" k-folds: %2.3f" % np.mean(k_fold_acc)
print "For comparison, accuracy from purely random classification: %2.3f" % (1.0/float(len(np.unique(y))))
print "Classification report for "+str(model)
print classification_report(y_true, y_pred, target_names=target_names)
#print "Confusion matrix for "+str(model)
#pprint.pprint(confusion_matrix(y_true, y_pred))
plot_confusion_matrix(y_true, y_pred, target_names=np.unique(y_true))
# We also form the cross validation samples and run the ML algorithms in separate functions, to use if desired.
# This is so that: a) cross validation samples aren't unnecessarily regenerated for each algorithm
# b) the comparison between the models is for exactly the same training and test data sets
# (accuracies for individual k-folds can then be compared)
def form_cross_val_set(X, y, kfolds):
    """Build a reusable cross-validation index set for X/y.

    kfolds == 1 : a single random 70/30 train/test split (as index lists).
    kfolds >= 2 : a stratified k-fold iterator over y.

    Returns an iterable of (train_idx, test_idx) pairs, so several models can
    be evaluated on exactly the same folds (see run_model).
    """
    assert(kfolds >= 1), "Invalid number of kfolds entered, must be int >= 1"
    if kfolds == 1:
        # BUG FIX: range(0, len(y)-1) silently dropped the last sample
        # (range already excludes its end point); use range(len(y)).
        cross_val = [train_test_split(range(len(y)), test_size=0.3)]
    else:
        cross_val = StratifiedKFold(y, kfolds)
    return cross_val
def run_model(cross_val_object, model,**kwargs):
y_true = []
y_pred = []
k_fold_acc = []
for train_idx, test_idx in cross_val_object:
#select training and test data
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
#train scaler and apply to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
clf = model(**kwargs)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
y_true = np.append(y_true, y_test)
y_pred = np.append(y_pred, pred)
k_fold_acc = np.append(k_fold_acc, np.mean(y_pred==y_true))
print "Accuracy of "+str(model)+" in past fold: %2.3f" % np.mean(y_true==y_pred)
print "Mean accuracy of "+str(model)+" over all k-folds: %2.3f" % np.mean(k_fold_acc)
print "For comparison, accuracy from purely random classification: %2.3f" % (1.0/float(len(np.unique(y))))
print "Classification report for "+str(model)
print classification_report(y_true, y_pred, target_names=target_names)
#print "Confusion matrix for "+str(model)
#pprint.pprint(confusion_matrix(y_true, y_pred))
plot_confusion_matrix(y_true, y_pred, target_names=np.unique(y_true))
In [9]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import SGDClassifier as SGD
# Perceptron() is actually equivalent to:
# SGDClassifier(loss=”perceptron”, eta0=1, learning_rate=”constant”, penalty=None).
from sklearn.linear_model import Perceptron
#from sklearn.svm import LinearSVC as SVM
#from sklearn.neighbors import KNeighborsClassifier as KNN
#from sklearn.linear_model import LogisticRegression
# Build one shared cross-validation index set so all three models are
# compared on exactly the same folds.
cv = form_cross_val_set(X=X,y=y,kfolds=5)
run_model(cross_val_object=cv, model=RF, n_jobs=4)
# NOTE(review): class_weight='auto' was deprecated in favour of 'balanced'
# in scikit-learn 0.17 -- confirm against the installed version.
run_model(cross_val_object=cv, model=SGD, n_jobs=4, class_weight='auto')
run_model(cross_val_object=cv, model=Perceptron, n_jobs=4, class_weight='auto')
# To run each with a different underlying cross validation data sets:
#cross_validate(X=X,y=y,kfolds=5,model=RF,n_jobs=4)
#cross_validate(X=X,y=y,kfolds=5,model=SGD,n_jobs=4)
#cross_validate(X=X,y=y,kfolds=5,model=Perceptron,n_jobs=4)
#cross_validate(X=X_small,y=y_small,kfolds=1,model=SVM)
In [ ]:
#from sklearn.neighbors import KNeighborsClassifier #The classification algorithm
# We of course should not be testing on the training set, but we are only checking that the code runs as expected
#K = 1 # number of nearest neighbours (Normally we'd optimize this in an inner loop!)
#Fit classifier and test it on the data
#clf = KNeighborsClassifier(K)
#clf.fit(X_scaled, y_small)
#y_pred = clf.predict(X_scaled)
#calculate the percentage correctly labelled samples
#print "Accuracy: %2.3f" % accuracy_score(y_small, y_pred)
In [ ]:
# Now do it properly, with k-fold cross validation
from sklearn.neighbors import KNeighborsClassifier
#Define classifier
K = 1 #number of nearest neighbours
clf = KNeighborsClassifier(K)
#Define cross-validation strategy
#cross_val = KFold(len(y_small), 5)
# loop over fold indexes
# train on trainingset, test on testset
# compute accuracy
y_true = []
y_pred = []
for train_idx, test_idx in cross_val:
#select training and test data
X_train, X_test = X_small[train_idx], X_small[test_idx]
y_train, y_test = y_small[train_idx], y_small[test_idx]
#train scaler and apply to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#train and test classifier
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
y_true = np.append(y_true, y_test)
y_pred = np.append(y_pred, pred)
print "Accuracy: %2.3f" % accuracy_score(y_true, y_pred)
In [ ]:
# We can do the same using much less code using scikit-learn's pipelines
from sklearn.ensemble import RandomForestClassifier
#Define classifier
clf = RandomForestClassifier(n_estimators=10,max_features='auto',max_depth=None,n_jobs=4)
#Define cross-validation strategy
# NOTE(review): cross_val is only defined if the KNN cross-validation cell
# above has been executed -- on a fresh kernel this cell raises NameError.
# Its folds were also sized for y_small, not the full X/y used here; verify.
pipe = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=cross_val)
print scores
print "Accuracy: %2.3f" % np.mean(scores)
In [ ]:
# We can do the same using much less code using scikit-learn's pipelines
from sklearn.neighbors import KNeighborsClassifier
#Define classifier
K = 5 #number of nearest neighbours
clf = KNeighborsClassifier(K)
#Define cross-validation strategy
# NOTE(review): cross_val is only defined if the KNN cross-validation cell
# above has been executed -- on a fresh kernel this cell raises NameError.
# Its folds were also sized for y_small, not the full X/y used here; verify.
pipe = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=cross_val)
print scores
print "Accuracy: %2.3f" % np.mean(scores)
In [ ]:
# We can do the same using much less code using scikit-learn's pipelines
from sklearn import svm
#Define classifier
# LinearSVC(), SVC(), NuSVC()
# dual=False: solve the primal problem (preferred when nsamples > nfeatures)
clf = svm.LinearSVC(dual=False,fit_intercept=True)
#Define cross-validation strategy
# NOTE(review): cross_val is only defined if the KNN cross-validation cell
# above has been executed -- on a fresh kernel this cell raises NameError.
# Its folds were also sized for y_small, not the full X/y used here; verify.
pipe = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=cross_val)
print scores
print "Accuracy: %2.3f" % np.mean(scores)
In [ ]:
# Compute some evaluation metrics
# NOTE(review): y_true/y_pred are whatever the most recently executed
# cross-validation cell left behind -- the result depends on execution order.
print classification_report(y_true, y_pred, target_names=target_names)
print "Confusion matrix"
print confusion_matrix(y_true, y_pred)
print "For comparison, precision from purely random classification: ",1.0/float(len(target_names))
plot_confusion_matrix(y_true, y_pred, target_names)
In [ ]:
# Plot the classification boundary between two of the features
# Beware: this implies training the algorithm on the subset of features of interest,
# so not computationally negligible
from matplotlib.colors import ListedColormap
# Choose size of data subset to use (1% of the rows)
entries = int(len(X)*0.01)
data_subset = data[:entries]
# Choose the two features to consider
X_plot = data_subset[["num_failed_logins","logged_in"]].values
y_plot = data_subset['Label'].values
h = .01 # step size in the plotting mesh
# 128 colours, so each distinct predicted class can get its own region colour
colrs= ["#000000", "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059",
        "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87",
        "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80",
        "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",
        "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",
        "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",
        "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",
        "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",
        "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",
        "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",
        "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",
        "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",
        "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72", "#6A3A4C",
        "#83AB58", "#001C1E", "#D1F7CE", "#004B28", "#C8D0F6", "#A3A489", "#806C66", "#222800",
        "#BF5650", "#E83000", "#66796D", "#DA007C", "#FF1A59", "#8ADBB4", "#1E0200", "#5B4E51",
        "#C895C5", "#320033", "#FF6832", "#66E1D3", "#CFCDAC", "#D0AC94", "#7ED379", "#012C58"]
n_neighbours = 1 # Number of nearest neighbours
# BUG FIX: the scaler used to be fitted and applied inside the loop below, so
# the second iteration re-fitted it on already-scaled data and transformed the
# points a second time. Fit and transform exactly once, before the loop.
scaler_plot = StandardScaler().fit(X_plot)
X_plot = scaler_plot.transform(X_plot)
# We train the algorithm and plot its decision boundaries in the two features, no testing involved
for weights in ['uniform', 'distance']:
    # Create an instance of the neighbours classifier and fit the data
    clf_plot = KNeighborsClassifier(n_neighbours, weights=weights)
    clf_plot.fit(X_plot, y_plot)
    # Generate a mesh [x_min, x_max]x[y_min, y_max] and predict the label for each point
    x_min, x_max = X_plot[:, 0].min() - 1, X_plot[:, 0].max() + 1
    y_min, y_max = X_plot[:, 1].min() - 1, X_plot[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf_plot.predict(np.c_[xx.ravel(), yy.ravel()])
    # Use only as many colours as there are distinct predicted classes
    colrs_req = ListedColormap(colrs[:len(set(Z))])
    # Put the result into a colour plot
    # NOTE(review): Z holds string labels; pcolormesh expects numeric values
    # -- the original's commented-out mapping hints this may need a
    # label-to-integer mapping to render. Confirm.
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=colrs_req)
    # Plot also the training points
    plt.scatter(X_plot[:, 0], X_plot[:, 1])
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("Two-feature classification boundary (k = %i, weights = '%s')"
              % (n_neighbours, weights))
    plt.show()
# In[ ]: