In [2]:
from __future__ import division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import ml_insights as mli
%matplotlib inline

In [3]:
def get_data_csv():
    return pd.read_csv('intrusions.csv')

intrusions = get_data_csv()

# Clean up data frame
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']

label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(intrusions['attack'])
label_encoded_y = label_encoder.transform(intrusions['attack'])
label_key = label_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])

dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'], prefix_sep='_', columns=['protocol_type', 'service', 'flag'])


# Split data
X = dummy.ix[:,(dummy.columns != 'attack')]
# y = label_encoded_y[:1000]
y = dummy['attack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=10)

In [4]:
xg = joblib.load('models/xg_scikit_model.pkl')

# y_pred = xg.predict(X_test)
y_pred = xg.predict_proba(X_test)
# predictions = [round(value) for value in y_pred]

In [5]:
from sklearn.metrics import precision_recall_curve

In [6]:
precision, recall, thresholds=precision_recall_curve(y_test,y_pred[:,18],pos_label=19)


c:\program files\anaconda2\lib\site-packages\sklearn\metrics\ranking.py:415: RuntimeWarning: invalid value encountered in true_divide
  recall = tps / tps[-1]

In [33]:
y_train.value_counts()


Out[33]:
smurf.              196481
neptune.             75226
normal.              67995
back.                 1549
satan.                1100
ipsweep.               892
portsweep.             711
warezclient.           707
teardrop.              676
pod.                   187
nmap.                  162
guess_passwd.           41
buffer_overflow.        22
warezmaster.            15
land.                   15
imap.                    8
rootkit.                 8
loadmodule.              6
ftp_write.               4
phf.                     3
multihop.                3
perl.                    2
spy.                     1
Name: attack, dtype: int64

In [54]:
y.value_counts()


Out[54]:
smurf.              280790
neptune.            107201
normal.              97277
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: attack, dtype: int64

In [8]:
target_names = [
    'portsweep.',
    'normal.',
    'warezclient.',
    'loadmodule.',
    'ipsweep.',
    'buffer_overflow.',
    'multihop.',
    'rootkit.',
    'teardrop.',
    'satan.',
    'smurf.',
    'perl.',
    'pod.',
    'land.',
    'neptune.',
    'phf.',
    'back.',
    'guess_passwd.',
    'ftp_write.',
    'imap.',
    'nmap.',
    'warezmaster.',
    'spy.'
]

target_names = sorted(target_names)

In [ ]:
back.                 654
satan.                489
ipsweep.              355
portsweep.            329
warezclient.          313
teardrop.             303

0, 17, 18, 9, 11, 5, 15, 20, 21

In [53]:
target_names


Out[53]:
['back.',
 'buffer_overflow.',
 'ftp_write.',
 'guess_passwd.',
 'imap.',
 'ipsweep.',
 'land.',
 'loadmodule.',
 'multihop.',
 'neptune.',
 'nmap.',
 'normal.',
 'perl.',
 'phf.',
 'pod.',
 'portsweep.',
 'rootkit.',
 'satan.',
 'smurf.',
 'spy.',
 'teardrop.',
 'warezclient.',
 'warezmaster.']

In [10]:
target_names[8]


Out[10]:
'multihop.'

In [50]:
class_precisions = []
class_recalls = []
for i in range(len(target_names)):
# for i in [0, 17, 18, 9, 11, 5, 15, 20, 21]:
    thresh_vec = np.unique(np.round(y_pred[:,i],4))
    y_test_b = (y_test == target_names[i])
    precisions = []
    recalls = []

    for th in thresh_vec:
        y_pred_b = y_pred[:,i] > th
        tp = np.sum(np.logical_and(y_test_b==1, y_pred_b==1))
        fp = np.sum(np.logical_and(y_test_b==0, y_pred_b==1))
        tn = np.sum(np.logical_and(y_test_b==0, y_pred_b==0))
        fn = np.sum(np.logical_and(y_test_b==1, y_pred_b==0))
        precision = 1.0 * tp /  (tp + fp)
        recall = 1.0 * tp / (tp + fn)
        precisions.append(precision)
        recalls.append(recall)
    
    class_precisions.append(precisions)
    class_recalls.append(recalls)


c:\program files\anaconda2\lib\site-packages\ipykernel\__main__.py:16: RuntimeWarning: invalid value encountered in double_scalars

In [43]:
# 0, 17, 5, 15, 20, 21
i = 21
thresh_vec = np.unique(np.round(y_pred[:,i],6))
y_test_b = (y_test == target_names[i])
precisions = []
recalls = []

for th in thresh_vec:
    y_pred_b = y_pred[:,i] > th
    tp = np.sum(np.logical_and(y_test_b==1, y_pred_b==1))
    fp = np.sum(np.logical_and(y_test_b==0, y_pred_b==1))
    tn = np.sum(np.logical_and(y_test_b==0, y_pred_b==0))
    fn = np.sum(np.logical_and(y_test_b==1, y_pred_b==0))
    precision = 1.0 * tp /  (tp + fp)
    recall = 1.0 * tp / (tp + fn)
    precisions.append(precision)
    recalls.append(recall)


c:\program files\anaconda2\lib\site-packages\ipykernel\__main__.py:14: RuntimeWarning: invalid value encountered in double_scalars

In [44]:
plt.plot(recalls, precisions, label=target_names[i])

plt.title('Precision-Recall Curve for XGBoost')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim(0,1.05)
plt.xlim(0,1.05)
plt.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)


Out[44]:
<matplotlib.legend.Legend at 0xf0bdc18>

In [52]:
plt.figure(figsize=(8,6))
# for c in [2,3,6,7,8]:
for c in [0, 5, 9, 11, 15, 17, 18, 20, 21]:
    plt.plot(class_recalls[c], class_precisions[c], label=target_names[c])

plt.title('Precision-Recall Curve for XGBoost')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim(0,1.05)
plt.xlim(0,1.05)
plt.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)


Out[52]:
<matplotlib.legend.Legend at 0x16396908>

In [40]:
def get_data_csv():
    return pd.read_csv('intrusions.csv')

intrusions = get_data_csv()

# Clean up data frame
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']

label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(intrusions['attack'])
label_encoded_y = label_encoder.transform(intrusions['attack'])
label_key = label_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])

dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'], prefix_sep='_', columns=['protocol_type', 'service', 'flag'])


# Split data
X = dummy.ix[:,(dummy.columns != 'attack')]
y = label_encoded_y
# y = dummy['attack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=10)

In [41]:
y


Out[41]:
array([15, 15, 15, ..., 15, 15, 15], dtype=int64)

In [42]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp


# Binarize the output
y_class = []
for i in pd.Series(y).unique():
    y_class.append(i)
y_class = sorted(y_class)

y = label_binarize(y, classes=y_class)
n_classes = y.shape[1]

# shuffle and split training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=10)

# Learn to predict each class against the other
# classifier = OneVsRestClassifier(xgb.XGBClassifier(learning_rate=0.3, objective='multi:softmax', silent=True))
# eval_set = [(X_train, y_train), (X_test, y_test)]
# y_score = xgb.predict(X_test)
y_predict = xg.predict(X_test)
y_test_2 = pd.get_dummies(y_test)
y_score = pd.get_dummies(y_predict)

n_classes = len(set(y_predict))

In [43]:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()


for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_2.iloc[:, i], y_score.iloc[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [44]:
%matplotlib inline
plt.figure(figsize = (10,7))
lw = 2

colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'green', 'lime'])


for class_num, color in zip(range(n_classes), colors):
#     if roc_auc[class_num] <= 0.99:
    plt.plot(fpr[class_num], tpr[class_num], color=color,
             lw=lw, label='ROC curve %s (area = %0.2f)' % (class_num, roc_auc[class_num]))


plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")


Out[44]:
<matplotlib.legend.Legend at 0x155b5080>

In [46]:
def generate_confusion_matrix(y_test, y_pred, labels):
    """Create the plot for our confusion matrix"""
    # y_test = joblib.load('models/rf_y_test.pkl')
    # y_pred = joblib.load('models/rf_y_pred.pkl')

    cm = confusion_matrix(y_test, y_pred)
    cat = labels

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    df_cm = pd.DataFrame(cm, index = [i for i in cat],
                    columns = [i for i in cat])
    plt.figure(figsize = (8,6))
    ax = sns.heatmap(df_cm, annot=False, cmap="GnBu")
    plt.setp(ax.get_yticklabels(), rotation=0)
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.tight_layout()

In [47]:
# Split data
X = dummy.ix[:,(dummy.columns != 'attack')]
# y = label_encoded_y
y = dummy['attack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=10)
y_pred_cm = xg.predict(X_test)
generate_confusion_matrix(y_test, y_pred_cm, label_key)



In [21]:
class_precisions = []
class_recalls = []
class_tprs = []
class_fprs = []
for i in range(len(target_names)):
    thresh_vec = np.unique(np.round(y_pred[:,i],6))
    y_test_b = (y_test == target_names[i])
    precisions = []
    recalls = []
    fprs = []
    tprs = []

    for th in thresh_vec:
        y_pred_b = y_pred[:,i] > th
        tp = np.sum(np.logical_and(y_test_b==1, y_pred_b==1))
        fp = np.sum(np.logical_and(y_test_b==0, y_pred_b==1))
        tn = np.sum(np.logical_and(y_test_b==0, y_pred_b==0))
        fn = np.sum(np.logical_and(y_test_b==1, y_pred_b==0))
        precision = 1.0 * tp /  (tp + fp)
        recall = 1.0 * tp / (tp + fn)
        fpr = 1.0 * fp / (fp + tn)
        tpr = 1.0 * tp / (tp + fn)
        precisions.append(precision)
        recalls.append(recall)
        fprs.append(fpr)
        tprs.append(tpr)
    class_precisions.append(precisions)
    class_recalls.append(recalls)
    class_fprs.append(fprs)
    class_tprs.append(tprs)


c:\program files\anaconda2\lib\site-packages\ipykernel\__main__.py:19: RuntimeWarning: invalid value encountered in double_scalars

In [22]:
plt.figure(figsize=(8,6))
for c in [2,3,6,7,8]:
    plt.plot(class_recalls[c], class_precisions[c], label=target_names[c])

plt.title('Precision-Recall Curve for XGBoost')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim(0,1.05)
plt.xlim(0,1.05)
plt.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)


Out[22]:
<matplotlib.legend.Legend at 0x14395b38>
c:\program files\anaconda2\lib\site-packages\matplotlib\transforms.py:660: RuntimeWarning: invalid value encountered in absolute
  inside = ((abs(dx0 + dx1) + abs(dy0 + dy1)) == 0)

In [20]:
fig, ax = plt.subplots(figsize = (8,6))
for c in range(len(class_fprs)):
    ax.plot(class_fprs[c], class_tprs[c], label=target_names[c])

# ax.set_xscale('log')
ax.set_title('ROC Curve on XGBoost')
ax.set_xlabel('False Positive Rate:    FP / (FP + TN) ')
ax.set_ylabel('True Positive Rate:    TP / (TP + FN) ')
# ax.set_ylim(0.996,1.0001)
# ax.set_xlim(0.00001,0.0004)
# ax.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)


Out[20]:
<matplotlib.text.Text at 0x161ca7f0>

In [19]:
fig, ax = plt.subplots(figsize = (8,6))
for c in [11]:
    ax.plot(class_fprs[c], class_tprs[c], label=target_names[c])

# ax.set_xscale('log')
ax.set_title('ROC Curve for Normal Traffic on XGBoost')
ax.set_xlabel('False Positive Rate:    FP / (FP + TN) ')
ax.set_ylabel('True Positive Rate:    TP / (TP + FN) ')
ax.set_ylim(0.996,1.0001)
ax.set_xlim(0.00001,0.0004)
# ax.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)


Out[19]:
(1e-05, 0.0004)

In [63]:
fig, ax = plt.subplots(figsize = (8,6))
ax.plot(fprs, tprs)
# ax.set_yscale('log')
ax.set_xscale('log')



In [ ]: