In [2]:
from __future__ import division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import ml_insights as mli
%matplotlib inline
In [3]:
def get_data_csv():
    """Load the raw intrusion-detection dataset from the local CSV file."""
    return pd.read_csv('intrusions.csv')

intrusions = get_data_csv()
# Clean up data frame
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']  # stray index column written by an earlier to_csv
# Integer-encode the string attack labels (LabelEncoder assigns codes in sorted order).
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(intrusions['attack'])
label_encoded_y = label_encoder.transform(intrusions['attack'])
# Map every integer code back to its attack name, in encoder order.
label_key = label_encoder.inverse_transform(list(range(23)))
# One-hot encode the categorical features.
dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'], prefix_sep='_', columns=['protocol_type', 'service', 'flag'])
# Split data
# .loc replaces the deprecated (and removed in pandas >= 1.0) DataFrame.ix indexer.
X = dummy.loc[:, dummy.columns != 'attack']
# y = label_encoded_y[:1000]
y = dummy['attack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
In [4]:
# Load the previously trained XGBoost model and score the held-out test set.
model_path = 'models/xg_scikit_model.pkl'
xg = joblib.load(model_path)
# Per-class probability estimates: one row per sample, one column per class.
y_pred = xg.predict_proba(X_test)
In [5]:
from sklearn.metrics import precision_recall_curve
In [6]:
# Binary precision-recall curve using column 18 of the probability matrix.
# NOTE(review): at this point y_test holds string attack labels (y = dummy['attack']),
# while pos_label=19 is an integer code — with string labels the positive class
# would never match. TODO confirm which label space was intended here.
precision, recall, thresholds=precision_recall_curve(y_test,y_pred[:,18],pos_label=19)
In [33]:
# Class distribution of the training split (rich display as the cell output).
y_train.value_counts()
Out[33]:
In [54]:
# Class distribution of the full label vector (rich display as the cell output).
y.value_counts()
Out[54]:
In [8]:
# Attack categories present in the dataset, written directly in sorted order so
# each name's list index matches the alphabetical class ordering used elsewhere.
target_names = [
    'back.',
    'buffer_overflow.',
    'ftp_write.',
    'guess_passwd.',
    'imap.',
    'ipsweep.',
    'land.',
    'loadmodule.',
    'multihop.',
    'neptune.',
    'nmap.',
    'normal.',
    'perl.',
    'phf.',
    'pod.',
    'portsweep.',
    'rootkit.',
    'satan.',
    'smurf.',
    'spy.',
    'teardrop.',
    'warezclient.',
    'warezmaster.',
]
In [ ]:
# Scratch notes (pasted value_counts output; left as comments so the cell stays runnable):
# back.           654
# satan.          489
# ipsweep.        355
# portsweep.      329
# warezclient.    313
# teardrop.       303
# target_names indices of interest: 0, 17, 18, 9, 11, 5, 15, 20, 21
In [53]:
# Display the sorted class-name list for reference.
target_names
Out[53]:
In [10]:
# Spot-check a single class name by index.
target_names[8]
Out[10]:
In [50]:
# Manual per-class precision/recall curves: for every class, sweep the unique
# (rounded) predicted probabilities as decision thresholds and record the
# precision/recall at each threshold.
class_precisions = []
class_recalls = []
for i in range(len(target_names)):
    # Candidate thresholds: unique class-i scores, rounded to 4 decimals.
    thresh_vec = np.unique(np.round(y_pred[:, i], 4))
    # Binary ground truth: is this sample an instance of class i?
    y_test_b = (y_test == target_names[i])
    precisions = []
    recalls = []
    for th in thresh_vec:
        y_pred_b = y_pred[:, i] > th
        tp = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 1))
        fp = np.sum(np.logical_and(y_test_b == 0, y_pred_b == 1))
        fn = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 0))
        # Guard the 0/0 cases: at the top threshold nothing is predicted positive
        # (precision undefined -> 1.0, the sklearn convention); a class absent
        # from y_test would make recall undefined -> 0.0. Previously these
        # produced NaN plus a RuntimeWarning.
        precision = 1.0 * tp / (tp + fp) if (tp + fp) > 0 else 1.0
        recall = 1.0 * tp / (tp + fn) if (tp + fn) > 0 else 0.0
        precisions.append(precision)
        recalls.append(recall)
    class_precisions.append(precisions)
    class_recalls.append(recalls)
In [43]:
# Precision/recall sweep for a single class (other candidates: 0, 17, 5, 15, 20).
i = 21
thresh_vec = np.unique(np.round(y_pred[:, i], 6))
y_test_b = (y_test == target_names[i])
precisions = []
recalls = []
for th in thresh_vec:
    y_pred_b = y_pred[:, i] > th
    tp = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 1))
    fp = np.sum(np.logical_and(y_test_b == 0, y_pred_b == 1))
    fn = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 0))
    # Guard 0/0: no predicted positives -> precision 1.0 (sklearn convention);
    # no actual positives -> recall 0.0 (previously NaN + RuntimeWarning).
    precision = 1.0 * tp / (tp + fp) if (tp + fp) > 0 else 1.0
    recall = 1.0 * tp / (tp + fn) if (tp + fn) > 0 else 0.0
    precisions.append(precision)
    recalls.append(recall)
In [44]:
# Plot the single-class precision-recall curve computed above.
ax = plt.gca()
ax.plot(recalls, precisions, label=target_names[i])
ax.set_title('Precision-Recall Curve for XGBoost')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_ylim(0, 1.05)
ax.set_xlim(0, 1.05)
ax.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)
Out[44]:
In [52]:
# Overlay precision-recall curves for a hand-picked subset of classes.
selected_classes = [0, 5, 9, 11, 15, 17, 18, 20, 21]
fig = plt.figure(figsize=(8, 6))
ax = fig.gca()
for class_id in selected_classes:
    ax.plot(class_recalls[class_id], class_precisions[class_id], label=target_names[class_id])
ax.set_title('Precision-Recall Curve for XGBoost')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_ylim(0, 1.05)
ax.set_xlim(0, 1.05)
ax.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)
Out[52]:
In [40]:
# NOTE(review): this cell duplicates the earlier load/encode cell; the only
# difference is that it keeps the integer-encoded target instead of strings.
def get_data_csv():
    """Load the raw intrusion-detection dataset from the local CSV file."""
    return pd.read_csv('intrusions.csv')

intrusions = get_data_csv()
# Clean up data frame
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(intrusions['attack'])
label_encoded_y = label_encoder.transform(intrusions['attack'])
label_key = label_encoder.inverse_transform(list(range(23)))
dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'], prefix_sep='_', columns=['protocol_type', 'service', 'flag'])
# Split data
# .loc replaces the deprecated (and removed in pandas >= 1.0) DataFrame.ix indexer.
X = dummy.loc[:, dummy.columns != 'attack']
y = label_encoded_y
# y = dummy['attack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
In [41]:
# Display the integer-encoded target vector (cell output).
y
Out[41]:
In [42]:
# NOTE(review): mid-notebook imports — convention is to hoist these into the top
# import cell. `svm`, `datasets`, `OneVsRestClassifier` and `interp` are unused
# in the visible code, and `scipy.interp` has been removed from modern SciPy
# (np.interp is the replacement) — confirm before re-running on a new env.
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
# Binarize the output
# Collect the distinct class codes present in y, then sort them.
y_class = []
for i in pd.Series(y).unique():
    y_class.append(i)
y_class = sorted(y_class)
# CAUTION: rebinds `y` (the integer label vector from the previous cell) to its
# one-hot binarized form — earlier cells break if re-run after this one.
y = label_binarize(y, classes=y_class)
n_classes = y.shape[1]
# shuffle and split training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=10)
# Learn to predict each class against the other
# classifier = OneVsRestClassifier(xgb.XGBClassifier(learning_rate=0.3, objective='multi:softmax', silent=True))
# eval_set = [(X_train, y_train), (X_test, y_test)]
# y_score = xgb.predict(X_test)
# Hard class predictions from the pre-trained model; one-hot encode both the
# truth and the predictions for the per-class ROC loop below.
y_predict = xg.predict(X_test)
y_test_2 = pd.get_dummies(y_test)
y_score = pd.get_dummies(y_predict)
# NOTE(review): n_classes is recomputed from the classes actually predicted; if
# the model never predicts some class, y_test_2 and y_score columns can
# misalign — verify the two column sets match before trusting the ROC results.
n_classes = len(set(y_predict))
In [43]:
# Compute ROC curve and AUC for each class, keyed by one-hot column index.
fpr = {}
tpr = {}
roc_auc = {}
for class_idx in range(n_classes):
    class_fpr, class_tpr, _ = roc_curve(y_test_2.iloc[:, class_idx], y_score.iloc[:, class_idx])
    fpr[class_idx] = class_fpr
    tpr[class_idx] = class_tpr
    roc_auc[class_idx] = auc(class_fpr, class_tpr)
In [44]:
%matplotlib inline
plt.figure(figsize = (10,7))
lw = 2
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'green', 'lime'])
for class_num, color in zip(range(n_classes), colors):
# if roc_auc[class_num] <= 0.99:
plt.plot(fpr[class_num], tpr[class_num], color=color,
lw=lw, label='ROC curve %s (area = %0.2f)' % (class_num, roc_auc[class_num]))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
Out[44]:
In [46]:
def generate_confusion_matrix(y_test, y_pred, labels):
    """Plot a row-normalized confusion matrix as a seaborn heatmap.

    Parameters
    ----------
    y_test : array-like of true labels.
    y_pred : array-like of predicted labels.
    labels : sequence of label names, in the order rows/columns should appear.
    """
    # Pass `labels` explicitly so the matrix rows/columns are guaranteed to be
    # in the same order as the axis tick labels (previously confusion_matrix
    # used its own sorted order, which only coincidentally matched).
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    # Normalize each row by its true-class count. Rows for classes absent from
    # y_test would be 0/0 -> NaN, so guard them to 0 instead.
    cm = cm.astype('float')
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)
    df_cm = pd.DataFrame(cm, index=list(labels), columns=list(labels))
    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(df_cm, annot=False, cmap="GnBu")
    plt.setp(ax.get_yticklabels(), rotation=0)
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.tight_layout()
In [47]:
# Rebuild the split on the string labels (same random_state as before, so the
# test rows match earlier splits) and plot the confusion matrix.
# .loc replaces the deprecated (and removed in pandas >= 1.0) DataFrame.ix indexer.
X = dummy.loc[:, dummy.columns != 'attack']
# y = label_encoded_y
y = dummy['attack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
y_pred_cm = xg.predict(X_test)
generate_confusion_matrix(y_test, y_pred_cm, label_key)
In [21]:
# Per-class precision/recall and ROC (FPR/TPR) points, swept over the unique
# (rounded) predicted probabilities for each class.
class_precisions = []
class_recalls = []
class_tprs = []
class_fprs = []
for i in range(len(target_names)):
    thresh_vec = np.unique(np.round(y_pred[:, i], 6))
    y_test_b = (y_test == target_names[i])
    precisions = []
    recalls = []
    fprs = []
    tprs = []
    for th in thresh_vec:
        y_pred_b = y_pred[:, i] > th
        tp = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 1))
        fp = np.sum(np.logical_and(y_test_b == 0, y_pred_b == 1))
        tn = np.sum(np.logical_and(y_test_b == 0, y_pred_b == 0))
        fn = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 0))
        # Guard 0/0: no predicted positives -> precision 1.0 (sklearn convention);
        # no actual positives -> recall/TPR 0.0; no actual negatives -> FPR 0.0.
        # Previously these produced NaN plus a RuntimeWarning.
        precision = 1.0 * tp / (tp + fp) if (tp + fp) > 0 else 1.0
        recall = 1.0 * tp / (tp + fn) if (tp + fn) > 0 else 0.0
        fpr = 1.0 * fp / (fp + tn) if (fp + tn) > 0 else 0.0
        precisions.append(precision)
        recalls.append(recall)
        fprs.append(fpr)
        tprs.append(recall)  # TPR is identical to recall: TP / (TP + FN)
    class_precisions.append(precisions)
    class_recalls.append(recalls)
    class_fprs.append(fprs)
    class_tprs.append(tprs)
In [22]:
# Precision-recall curves for a hand-picked subset of classes.
subset = [2, 3, 6, 7, 8]
fig = plt.figure(figsize=(8, 6))
ax = fig.gca()
for class_id in subset:
    ax.plot(class_recalls[class_id], class_precisions[class_id], label=target_names[class_id])
ax.set_title('Precision-Recall Curve for XGBoost')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_ylim(0, 1.05)
ax.set_xlim(0, 1.05)
ax.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)
Out[22]:
In [20]:
# Manually computed ROC curves for every class on a single set of axes.
fig, ax = plt.subplots(figsize=(8, 6))
for class_id in range(len(class_fprs)):
    ax.plot(class_fprs[class_id], class_tprs[class_id], label=target_names[class_id])
ax.set_title('ROC Curve on XGBoost')
ax.set_xlabel('False Positive Rate: FP / (FP + TN) ')
ax.set_ylabel('True Positive Rate: TP / (TP + FN) ')
Out[20]:
In [19]:
# Zoomed-in ROC curve for 'normal.' traffic (index 11 in the sorted target_names).
fig, ax = plt.subplots(figsize=(8, 6))
normal_idx = 11
ax.plot(class_fprs[normal_idx], class_tprs[normal_idx], label=target_names[normal_idx])
ax.set_title('ROC Curve for Normal Traffic on XGBoost')
ax.set_xlabel('False Positive Rate: FP / (FP + TN) ')
ax.set_ylabel('True Positive Rate: TP / (TP + FN) ')
ax.set_ylim(0.996, 1.0001)
ax.set_xlim(0.00001, 0.0004)
Out[19]:
In [63]:
# NOTE(review): `fprs`/`tprs` here appear to be leftovers from the final
# iteration of the sweep loop above (hidden kernel state) — plot an explicit
# class_fprs[c]/class_tprs[c] instead if a specific class is intended. Confirm
# which cell last wrote these before trusting this figure.
fig, ax = plt.subplots(figsize = (8,6))
ax.plot(fprs, tprs)
# ax.set_yscale('log')
ax.set_xscale('log')
In [ ]: