In [1]:
from __future__ import division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
import os
In [2]:
def get_data_csv(path='intrusions.csv'):
    """Load the intrusion records from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to 'intrusions.csv' (relative to
        the current working directory), the file name that used to be
        hard-coded, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(path)
def generate_confusion_matrix(y_test, y_pred):
    """Plot a row-normalised confusion matrix as a seaborn heatmap.

    Parameters
    ----------
    y_test : pandas.Series
        True class labels. ``value_counts`` is called on it to discover the
        set of labels shown on the axes.
    y_pred : array-like
        Predicted class labels, one per entry of ``y_test``.

    Returns
    -------
    None
        The heatmap is drawn on a new 8x6 matplotlib figure with the
        predicted class on the x axis and the actual class on the y axis.
    """
    # A single sorted label list drives both the matrix and the tick labels,
    # so row/column k of the matrix is always annotated with labels[k].
    # (Previously the matrix used the sorted order while the axes used the
    # frequency order returned by value_counts, mislabelling the heatmap.)
    labels = sorted(y_test.value_counts().index.tolist())

    # `labels` is passed by keyword: newer scikit-learn releases no longer
    # accept it positionally.
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # Normalise each row to fractions of the true class. A label with no true
    # samples has a zero row total; dividing by 1 instead keeps that row at 0
    # rather than filling it with NaN.
    row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
    row_totals[row_totals == 0] = 1.0
    cm = cm / row_totals

    df_cm = pd.DataFrame(cm, index=labels, columns=labels)

    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(df_cm, annot=False, cmap="GnBu")
    plt.setp(ax.get_yticklabels(), rotation=0)
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
In [3]:
# Load the raw intrusion records, index them by 'key_column', and discard
# the 'Unnamed: 0' column.
intrusions = get_data_csv()
intrusions = intrusions.set_index('key_column')
intrusions = intrusions.drop('Unnamed: 0', axis=1)
In [4]:
# Re-type the target as a pandas categorical under a new column name and
# drop the original 'attack' column.
intrusions['attack_cat'] = intrusions['attack'].astype('category')
del intrusions['attack']

# One-hot encode the three listed feature columns; the generated columns are
# prefixed 'Protocol_', 'Service_' and 'Flag_' respectively.
dummy = pd.get_dummies(
    intrusions,
    columns=['protocol_type', 'service', 'flag'],
    prefix=['Protocol', 'Service', 'Flag'],
    prefix_sep='_',
)

# Features are every column except the target. `.loc` replaces the `.ix`
# indexer, which was deprecated and then removed in pandas 1.0.
X = dummy.loc[:, dummy.columns != 'attack_cat']
y = dummy['attack_cat']

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10)
In [5]:
# Fit a 100-tree random forest on the training split. The forest is seeded
# (same value as the train/test split) so that re-running the notebook
# reproduces the same model and therefore the same downstream metrics.
rf = RandomForestClassifier(n_estimators=100, random_state=10)
rf.fit(X_train, y_train)
Out[5]:
In [6]:
# Hard class predictions for the held-out rows; used further down for the
# confusion matrix and the TP/FN tallies.
y_pred_hard = rf.predict(X_test)
In [7]:
# Per-class scores for the held-out rows; the threshold sweep below reads
# column i as the score for target_names[i].
y_pred = rf.predict_proba(X_test)
In [8]:
# Display the class labels stored on the fitted forest.
rf.classes_
Out[8]:
In [9]:
# Take the label list straight from the fitted model instead of hard-coding
# it. Later cells index the predict_proba output by position in target_names,
# so the two must have the same length and order; deriving the list from
# rf.classes_ guarantees that even if a rare class is absent from the
# training split.
target_names = list(rf.classes_)
In [10]:
# Display the label list assigned above.
target_names
Out[10]:
In [11]:
# Build one precision/recall curve per class (one-vs-rest) by sweeping a
# decision threshold over that class's predicted score.
#
# Column i of y_pred is treated as the score for target_names[i]; fail loudly
# if the label list does not line up with the fitted model's classes instead
# of silently producing curves for the wrong class.
if list(rf.classes_) != list(target_names):
    raise ValueError(
        'target_names must match rf.classes_ (same labels, same order) '
        'so that predict_proba columns line up with the class names')

# Plain array of the true labels, converted once outside the loops.
y_test_values = np.asarray(y_test)

class_precisions = []
class_recalls = []
for i in range(len(target_names)):
    class_scores = y_pred[:, i]
    is_class = (y_test_values == target_names[i])

    # Candidate thresholds: the distinct scores (rounded to 6 decimals),
    # in ascending order as returned by np.unique.
    thresh_vec = np.unique(np.round(class_scores, 6))

    precisions = []
    recalls = []
    for th in thresh_vec:
        flagged = class_scores > th
        tp = np.sum(is_class & flagged)
        fp = np.sum(~is_class & flagged)
        fn = np.sum(is_class & ~flagged)

        # When a ratio is undefined (nothing flagged / no true samples) the
        # value defaults to 1, as in the original computation.
        precision = 1.0 * tp / (tp + fp) if (tp + fp) > 0 else 1
        recall = 1.0 * tp / (tp + fn) if (tp + fn) > 0 else 1

        precisions.append(precision)
        recalls.append(recall)

    class_precisions.append(precisions)
    class_recalls.append(recalls)
In [12]:
%matplotlib inline
plt.figure(figsize=(8,6))
# for c in [5,6]:
for c in range(len(target_names)):
plt.plot(class_recalls[c], class_precisions[c], label=target_names[c])
plt.title('Precision-Recall Curve for Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim(0,1.05)
plt.xlim(0,1.05)
plt.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)
Out[12]:
In [ ]:
# Reference only: the 23 class labels in alphabetical order. The list is a
# bare expression and is not assigned to any name; an entry's position is its
# zero-based index (e.g. 'back.' is 0, 'warezmaster.' is 22).
['back.',
'buffer_overflow.',
'ftp_write.',
'guess_passwd.',
'imap.',
'ipsweep.',
'land.',
'loadmodule.',
'multihop.',
'neptune.',
'nmap.',
'normal.',
'perl.',
'phf.',
'pod.',
'portsweep.',
'rootkit.',
'satan.',
'smurf.',
'spy.',
'teardrop.',
'warezclient.',
'warezmaster.']
In [ ]:
guess_passwd. 3
buffer_overflow. 1
land. 6
warezmaster. 22
imap. 4
ftp_write. 2
multihop. 8
loadmodule. 7
rootkit. 16
perl. 12
spy. 19
phf. 13
In [19]:
# Class indices whose TP/FN counts are pooled below. They are the indices
# listed in the name -> index notes of the preceding cell
# (1 = buffer_overflow., 2 = ftp_write., ..., 22 = warezmaster.).
idx_combo = [1,2,3,4,6,7,8,12,13,16,19,22]
In [21]:
# Per-class true positives and false negatives for the classes in idx_combo.
# The confusion matrix does not depend on the loop variable, so it is computed
# once here instead of twice per iteration.
cm_hard = confusion_matrix(y_test, y_pred_hard)

tps = []
fns = []
for i in idx_combo:
    tp = cm_hard[i][i]              # diagonal: correctly predicted as class i
    fn = np.sum(cm_hard[i]) - tp    # rest of row i: class i predicted as something else
    tps.append(tp)
    fns.append(fn)
In [23]:
# Total true positives across the classes in idx_combo.
sum(tps)
Out[23]:
In [24]:
# Total false negatives across the classes in idx_combo.
sum(fns)
Out[24]:
In [14]:
# Draw the row-normalised confusion-matrix heatmap for the hard predictions.
generate_confusion_matrix(y_test, y_pred_hard)
In [ ]: