In [1]:
from __future__ import division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib  # sklearn.externals.joblib is deprecated; use the standalone package
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
import os

In [2]:
def get_data_csv():
    return pd.read_csv('intrusions.csv')


def generate_confusion_matrix(y_test, y_pred):
    """Plot a row-normalized confusion matrix as a heatmap."""
    # y_test = joblib.load('models/rf_y_test.pkl')
    # y_pred = joblib.load('models/rf_y_pred.pkl')

    labels = sorted(y_test.value_counts().index.tolist())
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # Normalize each row so cells show the fraction of that actual class
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(df_cm, annot=False, cmap="GnBu")
    plt.setp(ax.get_yticklabels(), rotation=0)
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
#     plt.savefig('app/static/fig/cm-10.png')

In [3]:
# Get intrusions data
intrusions = get_data_csv()

# Clean up data frame
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']

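Before encoding, it can help to see how skewed the attack labels are, since the rare categories come back at the end of the notebook; a quick look (a sketch, not part of the original run):

In [ ]:
# Row count per attack label; the rare categories have very few examples
intrusions['attack'].value_counts()
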
In [4]:
intrusions['attack_cat'] = intrusions['attack'].astype('category')
del intrusions['attack']
dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'], prefix_sep='_', columns=['protocol_type', 'service', 'flag'])

# Train/test split: features are every column except the target
X = dummy.loc[:, dummy.columns != 'attack_cat']
y = dummy['attack_cat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [5]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)


Out[5]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

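The commented-out joblib.load calls in generate_confusion_matrix suggest the fitted model and predictions get persisted for the app; a minimal sketch of saving the forest, assuming an illustrative models/ path:

In [ ]:
# Persist the fitted forest so it can be reloaded without retraining
# (the path is an assumption, not the project's actual layout)
joblib.dump(rf, 'models/rf.pkl')
# rf = joblib.load('models/rf.pkl')
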
In [6]:
# Hard class predictions, used later for the confusion matrix
y_pred_hard = rf.predict(X_test)

In [7]:
# Per-class probabilities; columns are ordered by rf.classes_
y_pred = rf.predict_proba(X_test)

In [8]:
rf.classes_


Out[8]:
array(['back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.', 'imap.',
       'ipsweep.', 'land.', 'loadmodule.', 'multihop.', 'neptune.',
       'nmap.', 'normal.', 'perl.', 'phf.', 'pod.', 'portsweep.',
       'rootkit.', 'satan.', 'smurf.', 'spy.', 'teardrop.', 'warezclient.',
       'warezmaster.'], dtype=object)

In [9]:
target_names = [
    'portsweep.',
    'normal.',
    'warezclient.',
    'loadmodule.',
    'ipsweep.',
    'buffer_overflow.',
    'multihop.',
    'rootkit.',
    'teardrop.',
    'satan.',
    'smurf.',
    'perl.',
    'pod.',
    'land.',
    'neptune.',
    'phf.',
    'back.',
    'guess_passwd.',
    'ftp_write.',
    'imap.',
    'nmap.',
    'warezmaster.',
    'spy.'
]

target_names = sorted(target_names)

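predict_proba orders its columns by rf.classes_, which scikit-learn stores in sorted order, so the sorted target_names list should line up with the probability columns; a quick sanity check (not part of the original run):

In [ ]:
# Confirm the probability columns line up with the sorted class names
assert list(rf.classes_) == target_names
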
In [10]:
target_names


Out[10]:
['back.',
 'buffer_overflow.',
 'ftp_write.',
 'guess_passwd.',
 'imap.',
 'ipsweep.',
 'land.',
 'loadmodule.',
 'multihop.',
 'neptune.',
 'nmap.',
 'normal.',
 'perl.',
 'phf.',
 'pod.',
 'portsweep.',
 'rootkit.',
 'satan.',
 'smurf.',
 'spy.',
 'teardrop.',
 'warezclient.',
 'warezmaster.']

In [11]:
# Build a per-class precision-recall curve by sweeping every distinct
# predicted probability as a decision threshold
class_precisions = []
class_recalls = []
for i in range(len(target_names)):
# for i in [0, 17, 18, 9, 11, 5, 15, 20, 21]:
    thresh_vec = np.unique(np.round(y_pred[:, i], 6))
    y_test_b = (y_test == target_names[i])
    precisions = []
    recalls = []

    for th in thresh_vec:
        y_pred_b = y_pred[:, i] > th
        tp = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 1))
        fp = np.sum(np.logical_and(y_test_b == 0, y_pred_b == 1))
        tn = np.sum(np.logical_and(y_test_b == 0, y_pred_b == 0))
        fn = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 0))
#         print(target_names[i], tp, fp, tn, fn)
        # Default to 1 when the denominator would be zero
        precision = 1
        recall = 1
        if (tp + fp) > 0:
            precision = 1.0 * tp / (tp + fp)
        if (tp + fn) > 0:
            recall = 1.0 * tp / (tp + fn)
        precisions.append(precision)
        recalls.append(recall)

    class_precisions.append(precisions)
    class_recalls.append(recalls)

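The manual sweep above can be cross-checked against sklearn's precision_recall_curve, which binarizes one class against the rest and sweeps thresholds the same way; a short sketch for a single class ('smurf.' is just an example):

In [ ]:
from sklearn.metrics import precision_recall_curve

# Cross-check the manual sweep for one class with sklearn's built-in curve
i = target_names.index('smurf.')
precision, recall, _ = precision_recall_curve((y_test == target_names[i]).astype(int),
                                               y_pred[:, i])
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall for %s (sklearn cross-check)' % target_names[i])
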
In [12]:
%matplotlib inline
plt.figure(figsize=(8,6))
# for c in [5,6]:
for c in range(len(target_names)):
    plt.plot(class_recalls[c], class_precisions[c], label=target_names[c])

plt.title('Precision-Recall Curve for Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim(0,1.05)
plt.xlim(0,1.05)
plt.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)


Out[12]:
<matplotlib.legend.Legend at 0x2dd6e48>

In [ ]:
# Index of each rare attack class in the sorted target_names list
# buffer_overflow.    1
# ftp_write.          2
# guess_passwd.       3
# imap.               4
# land.               6
# loadmodule.         7
# multihop.           8
# perl.              12
# phf.               13
# rootkit.           16
# spy.               19
# warezmaster.       22

In [19]:
# Indices of the rare attack classes (see the mapping above)
idx_combo = [1,2,3,4,6,7,8,12,13,16,19,22]

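The same indices could also be derived from target_names instead of hard-coded; a small sketch using the rare-class names listed above:

In [ ]:
# Derive the rare-class indices from target_names rather than hard-coding them
rare_classes = ['buffer_overflow.', 'ftp_write.', 'guess_passwd.', 'imap.',
                'land.', 'loadmodule.', 'multihop.', 'perl.', 'phf.',
                'rootkit.', 'spy.', 'warezmaster.']
idx_combo = sorted(target_names.index(c) for c in rare_classes)
idx_combo  # [1, 2, 3, 4, 6, 7, 8, 12, 13, 16, 19, 22]
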
In [21]:
# Count true positives and false negatives for the rare attack classes
# (confusion matrix rows/columns follow the sorted class order)
cm = confusion_matrix(y_test, y_pred_hard)
tps = []
fns = []
for i in idx_combo:
    tp = cm[i][i]
    fn = np.sum(cm[i]) - tp
    tps.append(tp)
    fns.append(fn)

In [23]:
sum(tps)


Out[23]:
35

In [24]:
sum(fns)


Out[24]:
16

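Across these rare categories the forest recovers 35 of 51 test instances, so the combined recall is roughly 0.69; a quick check:

In [ ]:
# Combined recall over the rare attack classes: 35 / (35 + 16) ≈ 0.686
sum(tps) / (sum(tps) + sum(fns))
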
In [14]:
generate_confusion_matrix(y_test, y_pred_hard)



In [ ]: